nv50_program.c revision 3accd7ebe971624bed5624f73ed3522c9de4c193
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "pipe/p_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33 34#define NV50_SU_MAX_TEMP 64 35//#define NV50_PROGRAM_DUMP 36 37/* ARL - gallium craps itself on progs/vp/arl.txt 38 * 39 * MSB - Like MAD, but MUL+SUB 40 * - Fuck it off, introduce a way to negate args for ops that 41 * support it. 42 * 43 * Look into inlining IMMD for ops other than MOV (make it general?) 44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this 46 * 47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 48 * case, if the emit_src() causes the inst to suddenly become long. 49 * 50 * Verify half-insns work where expected - and force disable them where they 51 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 52 * 53 * FUCK! watch dst==src vectors, can overwrite components that are needed. 54 * ie. SUB R0, R0.yzxw, R0 55 * 56 * Things to check with renouveau: 57 * FP attr/result assignment - how? 58 * attrib 59 * - 0x16bc maps vp output onto fp hpos 60 * - 0x16c0 maps vp output onto fp col0 61 * result 62 * - colr always 0-3 63 * - depr always 4 64 * 0x16bc->0x16e8 --> some binding between vp/fp regs 65 * 0x16b8 --> VP output count 66 * 67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 68 * "MOV rcol.x, fcol.y" = 0x00000004 69 * 0x19a8 --> as above but 0x00000100 and 0x00000000 70 * - 0x00100000 used when KIL used 71 * 0x196c --> as above but 0x00000011 and 0x00000000 72 * 73 * 0x1988 --> 0xXXNNNNNN 74 * - XX == FP high something 75 */ 76struct nv50_reg { 77 enum { 78 P_TEMP, 79 P_ATTR, 80 P_RESULT, 81 P_CONST, 82 P_IMMD 83 } type; 84 int index; 85 86 int hw; 87 int neg; 88 89 int rhw; /* result hw for FP outputs, or interpolant index */ 90 int acc; /* instruction where this reg is last read (first insn == 1) */ 91}; 92 93struct nv50_pc { 94 struct nv50_program *p; 95 96 /* hw resources */ 97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 98 99 /* tgsi resources */ 100 struct nv50_reg *temp; 101 int temp_nr; 102 struct nv50_reg *attr; 103 int attr_nr; 104 struct nv50_reg *result; 105 int result_nr; 106 struct nv50_reg *param; 107 int param_nr; 108 struct nv50_reg *immd; 109 float *immd_buf; 110 int immd_nr; 111 112 struct nv50_reg *temp_temp[16]; 113 unsigned temp_temp_nr; 114 115 unsigned interp_mode[32]; 116 /* perspective interpolation registers */ 117 struct nv50_reg *iv_p; 118 struct nv50_reg *iv_c; 119 120 /* current instruction and total number of insns */ 121 unsigned insn_cur; 122 unsigned insn_nr; 123 124 boolean allow32; 125}; 126 127static void 128alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 129{ 130 int i = 0; 131 132 if (reg->type == P_RESULT) { 133 if (pc->p->cfg.high_result < (reg->hw + 1)) 134 pc->p->cfg.high_result = reg->hw + 1; 135 } 136 137 if (reg->type != P_TEMP) 138 return; 139 140 if (reg->hw >= 0) { 141 /*XXX: do this here too to catch FP temp-as-attr usage.. 142 * not clean, but works */ 143 if (pc->p->cfg.high_temp < (reg->hw + 1)) 144 pc->p->cfg.high_temp = reg->hw + 1; 145 return; 146 } 147 148 if (reg->rhw != -1) { 149 /* try to allocate temporary with index rhw first */ 150 if (!(pc->r_temp[reg->rhw])) { 151 pc->r_temp[reg->rhw] = reg; 152 reg->hw = reg->rhw; 153 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 154 pc->p->cfg.high_temp = reg->rhw + 1; 155 return; 156 } 157 /* make sure we don't get things like $r0 needs to go 158 * in $r1 and $r1 in $r0 159 */ 160 i = pc->result_nr * 4; 161 } 162 163 for (; i < NV50_SU_MAX_TEMP; i++) { 164 if (!(pc->r_temp[i])) { 165 pc->r_temp[i] = reg; 166 reg->hw = i; 167 if (pc->p->cfg.high_temp < (i + 1)) 168 pc->p->cfg.high_temp = i + 1; 169 return; 170 } 171 } 172 173 assert(0); 174} 175 176static struct nv50_reg * 177alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 178{ 179 struct nv50_reg *r; 180 int i; 181 182 if (dst && dst->type == P_TEMP && dst->hw == -1) 183 return dst; 184 185 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 186 if (!pc->r_temp[i]) { 187 r = CALLOC_STRUCT(nv50_reg); 188 r->type = P_TEMP; 189 r->index = -1; 190 r->hw = i; 191 r->rhw = -1; 192 pc->r_temp[i] = r; 193 return r; 194 } 195 } 196 197 assert(0); 198 return NULL; 199} 200 201/* Assign the hw of the discarded temporary register src 202 * to the tgsi register dst and free src. 203 */ 204static void 205assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 206{ 207 assert(src->index == -1 && src->hw != -1); 208 209 if (dst->hw != -1) 210 pc->r_temp[dst->hw] = NULL; 211 pc->r_temp[src->hw] = dst; 212 dst->hw = src->hw; 213 214 FREE(src); 215} 216 217/* release the hardware resource held by r */ 218static void 219release_hw(struct nv50_pc *pc, struct nv50_reg *r) 220{ 221 assert(r->type == P_TEMP); 222 if (r->hw == -1) 223 return; 224 225 assert(pc->r_temp[r->hw] == r); 226 pc->r_temp[r->hw] = NULL; 227 228 r->acc = 0; 229 if (r->index == -1) 230 FREE(r); 231} 232 233static void 234free_temp(struct nv50_pc *pc, struct nv50_reg *r) 235{ 236 if (r->index == -1) { 237 unsigned hw = r->hw; 238 239 FREE(pc->r_temp[hw]); 240 pc->r_temp[hw] = NULL; 241 } 242} 243 244static int 245alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 246{ 247 int i; 248 249 if ((idx + 4) >= NV50_SU_MAX_TEMP) 250 return 1; 251 252 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 253 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 254 return alloc_temp4(pc, dst, idx + 1); 255 256 for (i = 0; i < 4; i++) { 257 dst[i] = CALLOC_STRUCT(nv50_reg); 258 dst[i]->type = P_TEMP; 259 dst[i]->index = -1; 260 dst[i]->hw = idx + i; 261 pc->r_temp[idx + i] = dst[i]; 262 } 263 264 return 0; 265} 266 267static void 268free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 269{ 270 int i; 271 272 for (i = 0; i < 4; i++) 273 free_temp(pc, reg[i]); 274} 275 276static struct nv50_reg * 277temp_temp(struct nv50_pc *pc) 278{ 279 if (pc->temp_temp_nr >= 16) 280 assert(0); 281 282 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 283 return pc->temp_temp[pc->temp_temp_nr++]; 284} 285 286static void 287kill_temp_temp(struct nv50_pc *pc) 288{ 289 int i; 290 291 for (i = 0; i < pc->temp_temp_nr; i++) 292 free_temp(pc, pc->temp_temp[i]); 293 pc->temp_temp_nr = 0; 294} 295 296static int 297ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) 298{ 299 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)), 300 (pc->immd_nr + 1) * 4 * sizeof(float)); 301 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 302 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 303 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 304 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 305 306 return pc->immd_nr++; 307} 308 309static struct nv50_reg * 310alloc_immd(struct nv50_pc *pc, float f) 311{ 312 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg); 313 unsigned hw; 314 315 for (hw = 0; hw < pc->immd_nr * 4; hw++) 316 if (pc->immd_buf[hw] == f) 317 break; 318 319 if (hw == pc->immd_nr * 4) 320 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4; 321 322 r->type = P_IMMD; 323 r->hw = hw; 324 r->index = -1; 325 return r; 326} 327 328static struct nv50_program_exec * 329exec(struct nv50_pc *pc) 330{ 331 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 332 333 e->param.index = -1; 334 return e; 335} 336 337static void 338emit(struct nv50_pc *pc, struct nv50_program_exec *e) 339{ 340 struct nv50_program *p = pc->p; 341 342 if (p->exec_tail) 343 p->exec_tail->next = e; 344 if (!p->exec_head) 345 p->exec_head = e; 346 p->exec_tail = e; 347 p->exec_size += (e->inst[0] & 1) ? 2 : 1; 348} 349 350static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 351 352static boolean 353is_long(struct nv50_program_exec *e) 354{ 355 if (e->inst[0] & 1) 356 return TRUE; 357 return FALSE; 358} 359 360static boolean 361is_immd(struct nv50_program_exec *e) 362{ 363 if (is_long(e) && (e->inst[1] & 3) == 3) 364 return TRUE; 365 return FALSE; 366} 367 368static INLINE void 369set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 370 struct nv50_program_exec *e) 371{ 372 set_long(pc, e); 373 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 374 e->inst[1] |= (pred << 7) | (idx << 12); 375} 376 377static INLINE void 378set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 379 struct nv50_program_exec *e) 380{ 381 set_long(pc, e); 382 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 383 e->inst[1] |= (idx << 4) | (on << 6); 384} 385 386static INLINE void 387set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 388{ 389 if (is_long(e)) 390 return; 391 392 e->inst[0] |= 1; 393 set_pred(pc, 0xf, 0, e); 394 set_pred_wr(pc, 0, 0, e); 395} 396 397static INLINE void 398set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 399{ 400 if (dst->type == P_RESULT) { 401 set_long(pc, e); 402 e->inst[1] |= 0x00000008; 403 } 404 405 alloc_reg(pc, dst); 406 e->inst[0] |= (dst->hw << 2); 407} 408 409static INLINE void 410set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 411{ 412 unsigned val = fui(pc->immd_buf[imm->hw]); 413 414 set_long(pc, e); 415 /*XXX: can't be predicated - bits overlap.. catch cases where both 416 * are required and avoid them. */ 417 set_pred(pc, 0, 0, e); 418 set_pred_wr(pc, 0, 0, e); 419 420 e->inst[1] |= 0x00000002 | 0x00000001; 421 e->inst[0] |= (val & 0x3f) << 16; 422 e->inst[1] |= (val >> 6) << 2; 423} 424 425 426#define INTERP_LINEAR 0 427#define INTERP_FLAT 1 428#define INTERP_PERSPECTIVE 2 429#define INTERP_CENTROID 4 430 431/* interpolant index has been stored in dst->rhw */ 432static void 433emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 434 unsigned mode) 435{ 436 assert(dst->rhw != -1); 437 struct nv50_program_exec *e = exec(pc); 438 439 e->inst[0] |= 0x80000000; 440 set_dst(pc, dst, e); 441 e->inst[0] |= (dst->rhw << 16); 442 443 if (mode & INTERP_FLAT) { 444 e->inst[0] |= (1 << 8); 445 } else { 446 if (mode & INTERP_PERSPECTIVE) { 447 e->inst[0] |= (1 << 25); 448 alloc_reg(pc, iv); 449 e->inst[0] |= (iv->hw << 9); 450 } 451 452 if (mode & INTERP_CENTROID) 453 e->inst[0] |= (1 << 24); 454 } 455 456 emit(pc, e); 457} 458 459static void 460set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 461 struct nv50_program_exec *e) 462{ 463 set_long(pc, e); 464 465 e->param.index = src->hw; 466 e->param.shift = s; 467 e->param.mask = m << (s % 32); 468 469 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 470} 471 472static void 473emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 474{ 475 struct nv50_program_exec *e = exec(pc); 476 477 e->inst[0] |= 0x10000000; 478 479 set_dst(pc, dst, e); 480 481 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) { 482 set_immd(pc, src, e); 483 /*XXX: 32-bit, but steals part of "half" reg space - need to 484 * catch and handle this case if/when we do half-regs 485 */ 486 } else 487 if (src->type == P_IMMD || src->type == P_CONST) { 488 set_long(pc, e); 489 set_data(pc, src, 0x7f, 9, e); 490 e->inst[1] |= 0x20000000; /* src0 const? */ 491 } else { 492 if (src->type == P_ATTR) { 493 set_long(pc, e); 494 e->inst[1] |= 0x00200000; 495 } 496 497 alloc_reg(pc, src); 498 e->inst[0] |= (src->hw << 9); 499 } 500 501 if (is_long(e) && !is_immd(e)) { 502 e->inst[1] |= 0x04000000; /* 32-bit */ 503 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */ 504 if (!(e->inst[1] & 0x20000000)) 505 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */ 506 } else 507 e->inst[0] |= 0x00008000; 508 509 emit(pc, e); 510} 511 512static INLINE void 513emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 514{ 515 struct nv50_reg *imm = alloc_immd(pc, f); 516 emit_mov(pc, dst, imm); 517 FREE(imm); 518} 519 520static boolean 521check_swap_src_0_1(struct nv50_pc *pc, 522 struct nv50_reg **s0, struct nv50_reg **s1) 523{ 524 struct nv50_reg *src0 = *s0, *src1 = *s1; 525 526 if (src0->type == P_CONST) { 527 if (src1->type != P_CONST) { 528 *s0 = src1; 529 *s1 = src0; 530 return TRUE; 531 } 532 } else 533 if (src1->type == P_ATTR) { 534 if (src0->type != P_ATTR) { 535 *s0 = src1; 536 *s1 = src0; 537 return TRUE; 538 } 539 } 540 541 return FALSE; 542} 543 544static void 545set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 546{ 547 if (src->type == P_ATTR) { 548 set_long(pc, e); 549 e->inst[1] |= 0x00200000; 550 } else 551 if (src->type == P_CONST || src->type == P_IMMD) { 552 struct nv50_reg *temp = temp_temp(pc); 553 554 emit_mov(pc, temp, src); 555 src = temp; 556 } 557 558 alloc_reg(pc, src); 559 e->inst[0] |= (src->hw << 9); 560} 561 562static void 563set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 564{ 565 if (src->type == P_ATTR) { 566 struct nv50_reg *temp = temp_temp(pc); 567 568 emit_mov(pc, temp, src); 569 src = temp; 570 } else 571 if (src->type == P_CONST || src->type == P_IMMD) { 572 assert(!(e->inst[0] & 0x00800000)); 573 if (e->inst[0] & 0x01000000) { 574 struct nv50_reg *temp = temp_temp(pc); 575 576 emit_mov(pc, temp, src); 577 src = temp; 578 } else { 579 set_data(pc, src, 0x7f, 16, e); 580 e->inst[0] |= 0x00800000; 581 } 582 } 583 584 alloc_reg(pc, src); 585 e->inst[0] |= (src->hw << 16); 586} 587 588static void 589set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 590{ 591 set_long(pc, e); 592 593 if (src->type == P_ATTR) { 594 struct nv50_reg *temp = temp_temp(pc); 595 596 emit_mov(pc, temp, src); 597 src = temp; 598 } else 599 if (src->type == P_CONST || src->type == P_IMMD) { 600 assert(!(e->inst[0] & 0x01000000)); 601 if (e->inst[0] & 0x00800000) { 602 struct nv50_reg *temp = temp_temp(pc); 603 604 emit_mov(pc, temp, src); 605 src = temp; 606 } else { 607 set_data(pc, src, 0x7f, 32+14, e); 608 e->inst[0] |= 0x01000000; 609 } 610 } 611 612 alloc_reg(pc, src); 613 e->inst[1] |= (src->hw << 14); 614} 615 616static void 617emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 618 struct nv50_reg *src1) 619{ 620 struct nv50_program_exec *e = exec(pc); 621 622 e->inst[0] |= 0xc0000000; 623 624 if (!pc->allow32) 625 set_long(pc, e); 626 627 check_swap_src_0_1(pc, &src0, &src1); 628 set_dst(pc, dst, e); 629 set_src_0(pc, src0, e); 630 if (src1->type == P_IMMD && !is_long(e)) 631 set_immd(pc, src1, e); 632 else 633 set_src_1(pc, src1, e); 634 635 emit(pc, e); 636} 637 638static void 639emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 640 struct nv50_reg *src0, struct nv50_reg *src1) 641{ 642 struct nv50_program_exec *e = exec(pc); 643 644 e->inst[0] |= 0xb0000000; 645 646 if (!pc->allow32) 647 set_long(pc, e); 648 649 check_swap_src_0_1(pc, &src0, &src1); 650 set_dst(pc, dst, e); 651 set_src_0(pc, src0, e); 652 if (is_long(e) || src1->type == P_CONST || src1->type == P_ATTR) 653 set_src_2(pc, src1, e); 654 else 655 if (src1->type == P_IMMD) 656 set_immd(pc, src1, e); 657 else 658 set_src_1(pc, src1, e); 659 660 emit(pc, e); 661} 662 663static void 664emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 665 struct nv50_reg *src0, struct nv50_reg *src1) 666{ 667 struct nv50_program_exec *e = exec(pc); 668 669 set_long(pc, e); 670 e->inst[0] |= 0xb0000000; 671 e->inst[1] |= (sub << 29); 672 673 check_swap_src_0_1(pc, &src0, &src1); 674 set_dst(pc, dst, e); 675 set_src_0(pc, src0, e); 676 set_src_1(pc, src1, e); 677 678 emit(pc, e); 679} 680 681static void 682emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 683 struct nv50_reg *src1) 684{ 685 struct nv50_program_exec *e = exec(pc); 686 687 e->inst[0] |= 0xb0000000; 688 689 set_long(pc, e); 690 if (check_swap_src_0_1(pc, &src0, &src1)) 691 e->inst[1] |= 0x04000000; 692 else 693 e->inst[1] |= 0x08000000; 694 695 set_dst(pc, dst, e); 696 set_src_0(pc, src0, e); 697 set_src_2(pc, src1, e); 698 699 emit(pc, e); 700} 701 702static void 703emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 704 struct nv50_reg *src1, struct nv50_reg *src2) 705{ 706 struct nv50_program_exec *e = exec(pc); 707 708 e->inst[0] |= 0xe0000000; 709 710 check_swap_src_0_1(pc, &src0, &src1); 711 set_dst(pc, dst, e); 712 set_src_0(pc, src0, e); 713 set_src_1(pc, src1, e); 714 set_src_2(pc, src2, e); 715 716 emit(pc, e); 717} 718 719static void 720emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 721 struct nv50_reg *src1, struct nv50_reg *src2) 722{ 723 struct nv50_program_exec *e = exec(pc); 724 725 e->inst[0] |= 0xe0000000; 726 set_long(pc, e); 727 e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */ 728 729 check_swap_src_0_1(pc, &src0, &src1); 730 set_dst(pc, dst, e); 731 set_src_0(pc, src0, e); 732 set_src_1(pc, src1, e); 733 set_src_2(pc, src2, e); 734 735 emit(pc, e); 736} 737 738static void 739emit_flop(struct nv50_pc *pc, unsigned sub, 740 struct nv50_reg *dst, struct nv50_reg *src) 741{ 742 struct nv50_program_exec *e = exec(pc); 743 744 e->inst[0] |= 0x90000000; 745 if (sub) { 746 set_long(pc, e); 747 e->inst[1] |= (sub << 29); 748 } 749 750 set_dst(pc, dst, e); 751 set_src_0(pc, src, e); 752 753 emit(pc, e); 754} 755 756static void 757emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 758{ 759 struct nv50_program_exec *e = exec(pc); 760 761 e->inst[0] |= 0xb0000000; 762 763 set_dst(pc, dst, e); 764 set_src_0(pc, src, e); 765 set_long(pc, e); 766 e->inst[1] |= (6 << 29) | 0x00004000; 767 768 emit(pc, e); 769} 770 771static void 772emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 773{ 774 struct nv50_program_exec *e = exec(pc); 775 776 e->inst[0] |= 0xb0000000; 777 778 set_dst(pc, dst, e); 779 set_src_0(pc, src, e); 780 set_long(pc, e); 781 e->inst[1] |= (6 << 29); 782 783 emit(pc, e); 784} 785 786static void 787emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst, 788 struct nv50_reg *src0, struct nv50_reg *src1) 789{ 790 struct nv50_program_exec *e = exec(pc); 791 unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; 792 struct nv50_reg *rdst; 793 794 assert(c_op <= 7); 795 if (check_swap_src_0_1(pc, &src0, &src1)) 796 c_op = inv_cop[c_op]; 797 798 rdst = dst; 799 if (dst->type != P_TEMP) 800 dst = alloc_temp(pc, NULL); 801 802 /* set.u32 */ 803 set_long(pc, e); 804 e->inst[0] |= 0xb0000000; 805 e->inst[1] |= (3 << 29); 806 e->inst[1] |= (c_op << 14); 807 /*XXX: breaks things, .u32 by default? 808 * decuda will disasm as .u16 and use .lo/.hi regs, but this 809 * doesn't seem to match what the hw actually does. 810 inst[1] |= 0x04000000; << breaks things.. .u32 by default? 811 */ 812 set_dst(pc, dst, e); 813 set_src_0(pc, src0, e); 814 set_src_1(pc, src1, e); 815 emit(pc, e); 816 817 /* cvt.f32.u32 */ 818 e = exec(pc); 819 e->inst[0] = 0xa0000001; 820 e->inst[1] = 0x64014780; 821 set_dst(pc, rdst, e); 822 set_src_0(pc, dst, e); 823 emit(pc, e); 824 825 if (dst != rdst) 826 free_temp(pc, dst); 827} 828 829static void 830emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 831{ 832 struct nv50_program_exec *e = exec(pc); 833 834 e->inst[0] = 0xa0000000; /* cvt */ 835 set_long(pc, e); 836 e->inst[1] |= (6 << 29); /* cvt */ 837 e->inst[1] |= 0x08000000; /* integer mode */ 838 e->inst[1] |= 0x04000000; /* 32 bit */ 839 e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */ 840 e->inst[1] |= (1 << 14); /* src .f32 */ 841 set_dst(pc, dst, e); 842 set_src_0(pc, src, e); 843 844 emit(pc, e); 845} 846 847static void 848emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 849 struct nv50_reg *v, struct nv50_reg *e) 850{ 851 struct nv50_reg *temp = alloc_temp(pc, NULL); 852 853 emit_flop(pc, 3, temp, v); 854 emit_mul(pc, temp, temp, e); 855 emit_preex2(pc, temp, temp); 856 emit_flop(pc, 6, dst, temp); 857 858 free_temp(pc, temp); 859} 860 861static void 862emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 863{ 864 struct nv50_program_exec *e = exec(pc); 865 866 e->inst[0] = 0xa0000000; /* cvt */ 867 set_long(pc, e); 868 e->inst[1] |= (6 << 29); /* cvt */ 869 e->inst[1] |= 0x04000000; /* 32 bit */ 870 e->inst[1] |= (1 << 14); /* src .f32 */ 871 e->inst[1] |= ((1 << 6) << 14); /* .abs */ 872 set_dst(pc, dst, e); 873 set_src_0(pc, src, e); 874 875 emit(pc, e); 876} 877 878static void 879emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 880 struct nv50_reg **src) 881{ 882 struct nv50_reg *one = alloc_immd(pc, 1.0); 883 struct nv50_reg *zero = alloc_immd(pc, 0.0); 884 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 885 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 886 struct nv50_reg *tmp[4]; 887 boolean allow32 = pc->allow32; 888 889 pc->allow32 = FALSE; 890 891 if (mask & (3 << 1)) { 892 tmp[0] = alloc_temp(pc, NULL); 893 emit_minmax(pc, 4, tmp[0], src[0], zero); 894 } 895 896 if (mask & (1 << 2)) { 897 set_pred_wr(pc, 1, 0, pc->p->exec_tail); 898 899 tmp[1] = temp_temp(pc); 900 emit_minmax(pc, 4, tmp[1], src[1], zero); 901 902 tmp[3] = temp_temp(pc); 903 emit_minmax(pc, 4, tmp[3], src[3], neg128); 904 emit_minmax(pc, 5, tmp[3], tmp[3], pos128); 905 906 emit_pow(pc, dst[2], tmp[1], tmp[3]); 907 emit_mov(pc, dst[2], zero); 908 set_pred(pc, 3, 0, pc->p->exec_tail); 909 } 910 911 if (mask & (1 << 1)) 912 assimilate_temp(pc, dst[1], tmp[0]); 913 else 914 if (mask & (1 << 2)) 915 free_temp(pc, tmp[0]); 916 917 pc->allow32 = allow32; 918 919 /* do this last, in case src[i,j] == dst[0,3] */ 920 if (mask & (1 << 0)) 921 emit_mov(pc, dst[0], one); 922 923 if (mask & (1 << 3)) 924 emit_mov(pc, dst[3], one); 925 926 FREE(pos128); 927 FREE(neg128); 928 FREE(zero); 929 FREE(one); 930} 931 932static void 933emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 934{ 935 struct nv50_program_exec *e = exec(pc); 936 937 set_long(pc, e); 938 e->inst[0] |= 0xa0000000; /* delta */ 939 e->inst[1] |= (7 << 29); /* delta */ 940 e->inst[1] |= 0x04000000; /* negate arg0? probably not */ 941 e->inst[1] |= (1 << 14); /* src .f32 */ 942 set_dst(pc, dst, e); 943 set_src_0(pc, src, e); 944 945 emit(pc, e); 946} 947 948static void 949emit_kil(struct nv50_pc *pc, struct nv50_reg *src) 950{ 951 struct nv50_program_exec *e; 952 const int r_pred = 1; 953 954 /* Sets predicate reg ? */ 955 e = exec(pc); 956 e->inst[0] = 0xa00001fd; 957 e->inst[1] = 0xc4014788; 958 set_src_0(pc, src, e); 959 set_pred_wr(pc, 1, r_pred, e); 960 emit(pc, e); 961 962 /* This is probably KILP */ 963 e = exec(pc); 964 e->inst[0] = 0x000001fe; 965 set_long(pc, e); 966 set_pred(pc, 1 /* LT? */, r_pred, e); 967 emit(pc, e); 968} 969 970static void 971emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 972 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj) 973{ 974 struct nv50_reg *temp, *t[4]; 975 struct nv50_program_exec *e; 976 977 unsigned c, mode, dim; 978 979 switch (type) { 980 case TGSI_TEXTURE_1D: 981 dim = 1; 982 break; 983 case TGSI_TEXTURE_UNKNOWN: 984 case TGSI_TEXTURE_2D: 985 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */ 986 case TGSI_TEXTURE_RECT: 987 dim = 2; 988 break; 989 case TGSI_TEXTURE_3D: 990 case TGSI_TEXTURE_CUBE: 991 case TGSI_TEXTURE_SHADOW2D: 992 case TGSI_TEXTURE_SHADOWRECT: /* XXX */ 993 dim = 3; 994 break; 995 default: 996 assert(0); 997 break; 998 } 999 1000 alloc_temp4(pc, t, 0); 1001 1002 if (proj) { 1003 if (src[0]->type == P_TEMP && src[0]->rhw != -1) { 1004 mode = pc->interp_mode[src[0]->index]; 1005 1006 t[3]->rhw = src[3]->rhw; 1007 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); 1008 emit_flop(pc, 0, t[3], t[3]); 1009 1010 for (c = 0; c < dim; c++) { 1011 t[c]->rhw = src[c]->rhw; 1012 emit_interp(pc, t[c], t[3], 1013 (mode | INTERP_PERSPECTIVE)); 1014 } 1015 } else { 1016 emit_flop(pc, 0, t[3], src[3]); 1017 for (c = 0; c < dim; c++) 1018 emit_mul(pc, t[c], src[c], t[3]); 1019 1020 /* XXX: for some reason the blob sometimes uses MAD: 1021 * emit_mad(pc, t[c], src[0][c], t[3], t[3]) 1022 * pc->p->exec_tail->inst[1] |= 0x080fc000; 1023 */ 1024 } 1025 } else { 1026 if (type == TGSI_TEXTURE_CUBE) { 1027 temp = temp_temp(pc); 1028 emit_minmax(pc, 4, temp, src[0], src[1]); 1029 emit_minmax(pc, 4, temp, temp, src[2]); 1030 emit_flop(pc, 0, temp, temp); 1031 for (c = 0; c < 3; c++) 1032 emit_mul(pc, t[c], src[c], temp); 1033 } else { 1034 for (c = 0; c < dim; c++) 1035 emit_mov(pc, t[c], src[c]); 1036 } 1037 } 1038 1039 e = exec(pc); 1040 set_long(pc, e); 1041 e->inst[0] |= 0xf0000000; 1042 e->inst[1] |= 0x00000004; 1043 set_dst(pc, t[0], e); 1044 e->inst[0] |= (unit << 9); 1045 1046 if (dim == 2) 1047 e->inst[0] |= 0x00400000; 1048 else 1049 if (dim == 3) 1050 e->inst[0] |= 0x00800000; 1051 1052 e->inst[0] |= (mask & 0x3) << 25; 1053 e->inst[1] |= (mask & 0xc) << 12; 1054 1055 emit(pc, e); 1056 1057#if 1 1058 if (mask & 1) emit_mov(pc, dst[0], t[0]); 1059 if (mask & 2) emit_mov(pc, dst[1], t[1]); 1060 if (mask & 4) emit_mov(pc, dst[2], t[2]); 1061 if (mask & 8) emit_mov(pc, dst[3], t[3]); 1062 1063 free_temp4(pc, t); 1064#else 1065 /* XXX: if p.e. MUL is used directly after TEX, it would still use 1066 * the texture coordinates, not the fetched values: latency ? */ 1067 1068 for (c = 0; c < 4; c++) { 1069 if (mask & (1 << c)) 1070 assimilate_temp(pc, dst[c], t[c]); 1071 else 1072 free_temp(pc, t[c]); 1073 } 1074#endif 1075} 1076 1077static void 1078convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) 1079{ 1080 unsigned q = 0, m = ~0; 1081 1082 assert(!is_long(e)); 1083 1084 switch (e->inst[0] >> 28) { 1085 case 0x1: 1086 /* MOV */ 1087 q = 0x0403c000; 1088 m = 0xffff7fff; 1089 break; 1090 case 0x8: 1091 /* INTERP */ 1092 m = ~0x02000000; 1093 if (e->inst[0] & 0x02000000) 1094 q = 0x00020000; 1095 break; 1096 case 0x9: 1097 /* RCP */ 1098 break; 1099 case 0xB: 1100 /* ADD */ 1101 m = ~(127 << 16); 1102 q = ((e->inst[0] & (~m)) >> 2); 1103 break; 1104 case 0xC: 1105 /* MUL */ 1106 m = ~0x00008000; 1107 q = ((e->inst[0] & (~m)) << 12); 1108 break; 1109 case 0xE: 1110 /* MAD (if src2 == dst) */ 1111 q = ((e->inst[0] & 0x1fc) << 12); 1112 break; 1113 default: 1114 assert(0); 1115 break; 1116 } 1117 1118 set_long(pc, e); 1119 pc->p->exec_size++; 1120 1121 e->inst[0] &= m; 1122 e->inst[1] |= q; 1123} 1124 1125static struct nv50_reg * 1126tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) 1127{ 1128 switch (dst->DstRegister.File) { 1129 case TGSI_FILE_TEMPORARY: 1130 return &pc->temp[dst->DstRegister.Index * 4 + c]; 1131 case TGSI_FILE_OUTPUT: 1132 return &pc->result[dst->DstRegister.Index * 4 + c]; 1133 case TGSI_FILE_NULL: 1134 return NULL; 1135 default: 1136 break; 1137 } 1138 1139 return NULL; 1140} 1141 1142static struct nv50_reg * 1143tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src) 1144{ 1145 struct nv50_reg *r = NULL; 1146 struct nv50_reg *temp; 1147 unsigned sgn, c; 1148 1149 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); 1150 1151 c = tgsi_util_get_full_src_register_extswizzle(src, chan); 1152 switch (c) { 1153 case TGSI_EXTSWIZZLE_X: 1154 case TGSI_EXTSWIZZLE_Y: 1155 case TGSI_EXTSWIZZLE_Z: 1156 case TGSI_EXTSWIZZLE_W: 1157 switch (src->SrcRegister.File) { 1158 case TGSI_FILE_INPUT: 1159 r = &pc->attr[src->SrcRegister.Index * 4 + c]; 1160 break; 1161 case TGSI_FILE_TEMPORARY: 1162 r = &pc->temp[src->SrcRegister.Index * 4 + c]; 1163 break; 1164 case TGSI_FILE_CONSTANT: 1165 r = &pc->param[src->SrcRegister.Index * 4 + c]; 1166 break; 1167 case TGSI_FILE_IMMEDIATE: 1168 r = &pc->immd[src->SrcRegister.Index * 4 + c]; 1169 break; 1170 case TGSI_FILE_SAMPLER: 1171 break; 1172 default: 1173 assert(0); 1174 break; 1175 } 1176 break; 1177 case TGSI_EXTSWIZZLE_ZERO: 1178 r = alloc_immd(pc, 0.0); 1179 return r; 1180 case TGSI_EXTSWIZZLE_ONE: 1181 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET) 1182 return alloc_immd(pc, -1.0); 1183 return alloc_immd(pc, 1.0); 1184 default: 1185 assert(0); 1186 break; 1187 } 1188 1189 switch (sgn) { 1190 case TGSI_UTIL_SIGN_KEEP: 1191 break; 1192 case TGSI_UTIL_SIGN_CLEAR: 1193 temp = temp_temp(pc); 1194 emit_abs(pc, temp, r); 1195 r = temp; 1196 break; 1197 case TGSI_UTIL_SIGN_TOGGLE: 1198 temp = temp_temp(pc); 1199 emit_neg(pc, temp, r); 1200 r = temp; 1201 break; 1202 case TGSI_UTIL_SIGN_SET: 1203 temp = temp_temp(pc); 1204 emit_abs(pc, temp, r); 1205 emit_neg(pc, temp, temp); 1206 r = temp; 1207 break; 1208 default: 1209 assert(0); 1210 break; 1211 } 1212 1213 return r; 1214} 1215 1216/* returns TRUE if instruction can overwrite sources before they're read */ 1217static boolean 1218direct2dest_op(const struct tgsi_full_instruction *insn) 1219{ 1220 if (insn->Instruction.Saturate) 1221 return FALSE; 1222 1223 switch (insn->Instruction.Opcode) { 1224 case TGSI_OPCODE_COS: 1225 case TGSI_OPCODE_DP3: 1226 case TGSI_OPCODE_DP4: 1227 case TGSI_OPCODE_DPH: 1228 case TGSI_OPCODE_KIL: 1229 case TGSI_OPCODE_LIT: 1230 case TGSI_OPCODE_POW: 1231 case TGSI_OPCODE_RCP: 1232 case TGSI_OPCODE_RSQ: 1233 case TGSI_OPCODE_SCS: 1234 case TGSI_OPCODE_SIN: 1235 case TGSI_OPCODE_TEX: 1236 case TGSI_OPCODE_TXP: 1237 return FALSE; 1238 default: 1239 return TRUE; 1240 } 1241} 1242 1243static boolean 1244nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) 1245{ 1246 const struct tgsi_full_instruction *inst = &tok->FullInstruction; 1247 struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp; 1248 unsigned mask, sat, unit; 1249 boolean assimilate = FALSE; 1250 int i, c; 1251 1252 mask = inst->FullDstRegisters[0].DstRegister.WriteMask; 1253 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 1254 1255 for (c = 0; c < 4; c++) { 1256 if (mask & (1 << c)) 1257 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); 1258 else 1259 dst[c] = NULL; 1260 rdst[c] = NULL; 1261 src[0][c] = NULL; 1262 src[1][c] = NULL; 1263 src[2][c] = NULL; 1264 } 1265 1266 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1267 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; 1268 1269 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) 1270 unit = fs->SrcRegister.Index; 1271 1272 for (c = 0; c < 4; c++) 1273 src[i][c] = tgsi_src(pc, c, fs); 1274 } 1275 1276 if (sat) { 1277 for (c = 0; c < 4; c++) { 1278 rdst[c] = dst[c]; 1279 dst[c] = temp_temp(pc); 1280 } 1281 } else 1282 if (direct2dest_op(inst)) { 1283 for (c = 0; c < 4; c++) { 1284 if (!dst[c] || dst[c]->type != P_TEMP) 1285 continue; 1286 1287 for (i = c + 1; i < 4; i++) { 1288 if (dst[c] == src[0][i] || 1289 dst[c] == src[1][i] || 1290 dst[c] == src[2][i]) 1291 break; 1292 } 1293 if (i == 4) 1294 continue; 1295 1296 assimilate = TRUE; 1297 rdst[c] = dst[c]; 1298 dst[c] = alloc_temp(pc, NULL); 1299 } 1300 } 1301 1302 switch (inst->Instruction.Opcode) { 1303 case TGSI_OPCODE_ABS: 1304 for (c = 0; c < 4; c++) { 1305 if (!(mask & (1 << c))) 1306 continue; 1307 emit_abs(pc, dst[c], src[0][c]); 1308 } 1309 break; 1310 case TGSI_OPCODE_ADD: 1311 for (c = 0; c < 4; c++) { 1312 if (!(mask & (1 << c))) 1313 continue; 1314 emit_add(pc, dst[c], src[0][c], src[1][c]); 1315 } 1316 break; 1317 case TGSI_OPCODE_COS: 1318 temp = temp_temp(pc); 1319 emit_precossin(pc, temp, src[0][0]); 1320 emit_flop(pc, 5, temp, temp); 1321 for (c = 0; c < 4; c++) { 1322 if (!(mask & (1 << c))) 1323 continue; 1324 emit_mov(pc, dst[c], temp); 1325 } 1326 break; 1327 case TGSI_OPCODE_DP3: 1328 temp = temp_temp(pc); 1329 emit_mul(pc, temp, src[0][0], src[1][0]); 1330 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1331 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1332 for (c = 0; c < 4; c++) { 1333 if (!(mask & (1 << c))) 1334 continue; 1335 emit_mov(pc, dst[c], temp); 1336 } 1337 break; 1338 case TGSI_OPCODE_DP4: 1339 temp = temp_temp(pc); 1340 emit_mul(pc, temp, src[0][0], src[1][0]); 1341 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1342 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1343 emit_mad(pc, temp, src[0][3], src[1][3], temp); 1344 for (c = 0; c < 4; c++) { 1345 if (!(mask & (1 << c))) 1346 continue; 1347 emit_mov(pc, dst[c], temp); 1348 } 1349 break; 1350 case TGSI_OPCODE_DPH: 1351 temp = temp_temp(pc); 1352 emit_mul(pc, temp, src[0][0], src[1][0]); 1353 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1354 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1355 emit_add(pc, temp, src[1][3], temp); 1356 for (c = 0; c < 4; c++) { 1357 if (!(mask & (1 << c))) 1358 continue; 1359 emit_mov(pc, dst[c], temp); 1360 } 1361 break; 1362 case TGSI_OPCODE_DST: 1363 { 1364 struct nv50_reg *one = alloc_immd(pc, 1.0); 1365 if (mask & (1 << 0)) 1366 emit_mov(pc, dst[0], one); 1367 if (mask & (1 << 1)) 1368 emit_mul(pc, dst[1], src[0][1], src[1][1]); 1369 if (mask & (1 << 2)) 1370 emit_mov(pc, dst[2], src[0][2]); 1371 if (mask & (1 << 3)) 1372 emit_mov(pc, dst[3], src[1][3]); 1373 FREE(one); 1374 } 1375 break; 1376 case TGSI_OPCODE_EX2: 1377 temp = temp_temp(pc); 1378 emit_preex2(pc, temp, src[0][0]); 1379 emit_flop(pc, 6, temp, temp); 1380 for (c = 0; c < 4; c++) { 1381 if (!(mask & (1 << c))) 1382 continue; 1383 emit_mov(pc, dst[c], temp); 1384 } 1385 break; 1386 case TGSI_OPCODE_FLR: 1387 for (c = 0; c < 4; c++) { 1388 if (!(mask & (1 << c))) 1389 continue; 1390 emit_flr(pc, dst[c], src[0][c]); 1391 } 1392 break; 1393 case TGSI_OPCODE_FRC: 1394 temp = temp_temp(pc); 1395 for (c = 0; c < 4; c++) { 1396 if (!(mask & (1 << c))) 1397 continue; 1398 emit_flr(pc, temp, src[0][c]); 1399 emit_sub(pc, dst[c], src[0][c], temp); 1400 } 1401 break; 1402 case TGSI_OPCODE_KIL: 1403 emit_kil(pc, src[0][0]); 1404 emit_kil(pc, src[0][1]); 1405 emit_kil(pc, src[0][2]); 1406 emit_kil(pc, src[0][3]); 1407 pc->p->cfg.fp.regs[2] |= 0x00100000; 1408 break; 1409 case TGSI_OPCODE_LIT: 1410 emit_lit(pc, &dst[0], mask, &src[0][0]); 1411 break; 1412 case TGSI_OPCODE_LG2: 1413 temp = temp_temp(pc); 1414 emit_flop(pc, 3, temp, src[0][0]); 1415 for (c = 0; c < 4; c++) { 1416 if (!(mask & (1 << c))) 1417 continue; 1418 emit_mov(pc, dst[c], temp); 1419 } 1420 break; 1421 case TGSI_OPCODE_LRP: 1422 temp = temp_temp(pc); 1423 for (c = 0; c < 4; c++) { 1424 if (!(mask & (1 << c))) 1425 continue; 1426 emit_sub(pc, temp, src[1][c], src[2][c]); 1427 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 1428 } 1429 break; 1430 case TGSI_OPCODE_MAD: 1431 for (c = 0; c < 4; c++) { 1432 if (!(mask & (1 << c))) 1433 continue; 1434 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 1435 } 1436 break; 1437 case TGSI_OPCODE_MAX: 1438 for (c = 0; c < 4; c++) { 1439 if (!(mask & (1 << c))) 1440 continue; 1441 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); 1442 } 1443 break; 1444 case TGSI_OPCODE_MIN: 1445 for (c = 0; c < 4; c++) { 1446 if (!(mask & (1 << c))) 1447 continue; 1448 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); 1449 } 1450 break; 1451 case TGSI_OPCODE_MOV: 1452 for (c = 0; c < 4; c++) { 1453 if (!(mask & (1 << c))) 1454 continue; 1455 emit_mov(pc, dst[c], src[0][c]); 1456 } 1457 break; 1458 case TGSI_OPCODE_MUL: 1459 for (c = 0; c < 4; c++) { 1460 if (!(mask & (1 << c))) 1461 continue; 1462 emit_mul(pc, dst[c], src[0][c], src[1][c]); 1463 } 1464 break; 1465 case TGSI_OPCODE_POW: 1466 temp = temp_temp(pc); 1467 emit_pow(pc, temp, src[0][0], src[1][0]); 1468 for (c = 0; c < 4; c++) { 1469 if (!(mask & (1 << c))) 1470 continue; 1471 emit_mov(pc, dst[c], temp); 1472 } 1473 break; 1474 case TGSI_OPCODE_RCP: 1475 for (c = 3; c >= 0; c--) { 1476 if (!(mask & (1 << c))) 1477 continue; 1478 emit_flop(pc, 0, dst[c], src[0][0]); 1479 } 1480 break; 1481 case TGSI_OPCODE_RSQ: 1482 for (c = 3; c >= 0; c--) { 1483 if (!(mask & (1 << c))) 1484 continue; 1485 emit_flop(pc, 2, dst[c], src[0][0]); 1486 } 1487 break; 1488 case TGSI_OPCODE_SCS: 1489 temp = temp_temp(pc); 1490 emit_precossin(pc, temp, src[0][0]); 1491 if (mask & (1 << 0)) 1492 emit_flop(pc, 5, dst[0], temp); 1493 if (mask & (1 << 1)) 1494 emit_flop(pc, 4, dst[1], temp); 1495 if (mask & (1 << 2)) 1496 emit_mov_immdval(pc, dst[2], 0.0); 1497 if (mask & (1 << 3)) 1498 emit_mov_immdval(pc, dst[3], 1.0); 1499 break; 1500 case TGSI_OPCODE_SGE: 1501 for (c = 0; c < 4; c++) { 1502 if (!(mask & (1 << c))) 1503 continue; 1504 emit_set(pc, 6, dst[c], src[0][c], src[1][c]); 1505 } 1506 break; 1507 case TGSI_OPCODE_SIN: 1508 temp = temp_temp(pc); 1509 emit_precossin(pc, temp, src[0][0]); 1510 emit_flop(pc, 4, temp, temp); 1511 for (c = 0; c < 4; c++) { 1512 if (!(mask & (1 << c))) 1513 continue; 1514 emit_mov(pc, dst[c], temp); 1515 } 1516 break; 1517 case TGSI_OPCODE_SLT: 1518 for (c = 0; c < 4; c++) { 1519 if (!(mask & (1 << c))) 1520 continue; 1521 emit_set(pc, 1, dst[c], src[0][c], src[1][c]); 1522 } 1523 break; 1524 case TGSI_OPCODE_SUB: 1525 for (c = 0; c < 4; c++) { 1526 if (!(mask & (1 << c))) 1527 continue; 1528 emit_sub(pc, dst[c], src[0][c], src[1][c]); 1529 } 1530 break; 1531 case TGSI_OPCODE_TEX: 1532 emit_tex(pc, dst, mask, src[0], unit, 1533 inst->InstructionExtTexture.Texture, FALSE); 1534 break; 1535 case TGSI_OPCODE_TXP: 1536 emit_tex(pc, dst, mask, src[0], unit, 1537 inst->InstructionExtTexture.Texture, TRUE); 1538 break; 1539 case TGSI_OPCODE_XPD: 1540 temp = temp_temp(pc); 1541 if (mask & (1 << 0)) { 1542 emit_mul(pc, temp, src[0][2], src[1][1]); 1543 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 1544 } 1545 if (mask & (1 << 1)) { 1546 emit_mul(pc, temp, src[0][0], src[1][2]); 1547 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 1548 } 1549 if (mask & (1 << 2)) { 1550 emit_mul(pc, temp, src[0][1], src[1][0]); 1551 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 1552 } 1553 if (mask & (1 << 3)) 1554 emit_mov_immdval(pc, dst[3], 1.0); 1555 break; 1556 case TGSI_OPCODE_END: 1557 break; 1558 default: 1559 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 1560 return FALSE; 1561 } 1562 1563 if (sat) { 1564 for (c = 0; c < 4; c++) { 1565 struct nv50_program_exec *e; 1566 1567 if (!(mask & (1 << c))) 1568 continue; 1569 e = exec(pc); 1570 1571 e->inst[0] = 0xa0000000; /* cvt */ 1572 set_long(pc, e); 1573 e->inst[1] |= (6 << 29); /* cvt */ 1574 e->inst[1] |= 0x04000000; /* 32 bit */ 1575 e->inst[1] |= (1 << 14); /* src .f32 */ 1576 e->inst[1] |= ((1 << 5) << 14); /* .sat */ 1577 set_dst(pc, rdst[c], e); 1578 set_src_0(pc, dst[c], e); 1579 emit(pc, e); 1580 } 1581 } else if (assimilate) { 1582 for (c = 0; c < 4; c++) 1583 if (rdst[c]) 1584 assimilate_temp(pc, rdst[c], dst[c]); 1585 } 1586 1587 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1588 for (c = 0; c < 4; c++) { 1589 if (!src[i][c]) 1590 continue; 1591 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) 1592 FREE(src[i][c]); 1593 else 1594 if (src[i][c]->acc == pc->insn_cur) 1595 release_hw(pc, src[i][c]); 1596 } 1597 } 1598 1599 kill_temp_temp(pc); 1600 return TRUE; 1601} 1602 1603/* Adjust a bitmask that indicates what components of a source are used, 1604 * we use this in tx_prep so we only load interpolants that are needed. 1605 */ 1606static void 1607insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask) 1608{ 1609 const struct tgsi_instruction_ext_texture *tex; 1610 1611 switch (insn->Instruction.Opcode) { 1612 case TGSI_OPCODE_DP3: 1613 *mask = 0x7; 1614 break; 1615 case TGSI_OPCODE_DP4: 1616 case TGSI_OPCODE_DPH: 1617 *mask = 0xF; 1618 break; 1619 case TGSI_OPCODE_LIT: 1620 *mask = 0xB; 1621 break; 1622 case TGSI_OPCODE_RCP: 1623 case TGSI_OPCODE_RSQ: 1624 *mask = 0x1; 1625 break; 1626 case TGSI_OPCODE_TEX: 1627 case TGSI_OPCODE_TXP: 1628 assert(insn->Instruction.Extended); 1629 tex = &insn->InstructionExtTexture; 1630 1631 *mask = 0x7; 1632 if (tex->Texture == TGSI_TEXTURE_1D) 1633 *mask = 0x1; 1634 else 1635 if (tex->Texture == TGSI_TEXTURE_2D) 1636 *mask = 0x3; 1637 1638 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) 1639 *mask |= 0x8; 1640 break; 1641 default: 1642 break; 1643 } 1644} 1645 1646static void 1647prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok, 1648 unsigned *r_usage[2]) 1649{ 1650 const struct tgsi_full_instruction *insn; 1651 const struct tgsi_full_src_register *src; 1652 const struct tgsi_dst_register *dst; 1653 1654 unsigned i, c, k, n, mask, *acc_p; 1655 1656 insn = &tok->FullInstruction; 1657 dst = &insn->FullDstRegisters[0].DstRegister; 1658 mask = dst->WriteMask; 1659 1660 if (!r_usage[0]) 1661 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned)); 1662 if (!r_usage[1]) 1663 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned)); 1664 1665 if (dst->File == TGSI_FILE_TEMPORARY) { 1666 for (c = 0; c < 4; c++) { 1667 if (!(mask & (1 << c))) 1668 continue; 1669 r_usage[0][dst->Index * 4 + c] = pc->insn_nr; 1670 } 1671 } 1672 1673 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 1674 src = &insn->FullSrcRegisters[i]; 1675 1676 switch (src->SrcRegister.File) { 1677 case TGSI_FILE_TEMPORARY: 1678 acc_p = r_usage[0]; 1679 break; 1680 case TGSI_FILE_INPUT: 1681 acc_p = r_usage[1]; 1682 break; 1683 default: 1684 continue; 1685 } 1686 1687 insn_adjust_mask(insn, &mask); 1688 1689 for (c = 0; c < 4; c++) { 1690 if (!(mask & (1 << c))) 1691 continue; 1692 1693 k = tgsi_util_get_full_src_register_extswizzle(src, c); 1694 switch (k) { 1695 case TGSI_EXTSWIZZLE_X: 1696 case TGSI_EXTSWIZZLE_Y: 1697 case TGSI_EXTSWIZZLE_Z: 1698 case TGSI_EXTSWIZZLE_W: 1699 n = src->SrcRegister.Index * 4 + k; 1700 acc_p[n] = pc->insn_nr; 1701 break; 1702 default: 1703 break; 1704 } 1705 } 1706 } 1707} 1708 1709static unsigned 1710load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid, 1711 int *aid, int *p_oid) 1712{ 1713 struct nv50_reg *iv; 1714 int oid, c, n; 1715 unsigned mask = 0; 1716 1717 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p; 1718 1719 for (c = 0, n = i * 4; c < 4; c++, n++) { 1720 oid = (*p_oid)++; 1721 pc->attr[n].type = P_TEMP; 1722 pc->attr[n].index = i; 1723 1724 if (pc->attr[n].acc == acc[n]) 1725 continue; 1726 mask |= (1 << c); 1727 1728 pc->attr[n].acc = acc[n]; 1729 pc->attr[n].rhw = pc->attr[n].hw = -1; 1730 alloc_reg(pc, &pc->attr[n]); 1731 1732 pc->attr[n].rhw = (*aid)++; 1733 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]); 1734 1735 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4)); 1736 (*mid)++; 1737 pc->p->cfg.fp.regs[1] += 0x00010001; 1738 } 1739 1740 return mask; 1741} 1742 1743static boolean 1744nv50_program_tx_prep(struct nv50_pc *pc) 1745{ 1746 struct tgsi_parse_context p; 1747 boolean ret = FALSE; 1748 unsigned i, c; 1749 unsigned fcol, bcol, fcrd, depr; 1750 1751 /* count (centroid) perspective interpolations */ 1752 unsigned centroid_loads = 0; 1753 unsigned perspect_loads = 0; 1754 1755 /* track register access for temps and attrs */ 1756 unsigned *r_usage[2]; 1757 r_usage[0] = NULL; 1758 r_usage[1] = NULL; 1759 1760 depr = fcol = bcol = fcrd = 0xffff; 1761 1762 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 1763 pc->p->cfg.fp.regs[0] = 0x01000404; 1764 pc->p->cfg.fp.regs[1] = 0x00000400; 1765 } 1766 1767 tgsi_parse_init(&p, pc->p->pipe.tokens); 1768 while (!tgsi_parse_end_of_tokens(&p)) { 1769 const union tgsi_full_token *tok = &p.FullToken; 1770 1771 tgsi_parse_token(&p); 1772 switch (tok->Token.Type) { 1773 case TGSI_TOKEN_TYPE_IMMEDIATE: 1774 { 1775 const struct tgsi_full_immediate *imm = 1776 &p.FullToken.FullImmediate; 1777 1778 ctor_immd(pc, imm->u.ImmediateFloat32[0].Float, 1779 imm->u.ImmediateFloat32[1].Float, 1780 imm->u.ImmediateFloat32[2].Float, 1781 imm->u.ImmediateFloat32[3].Float); 1782 } 1783 break; 1784 case TGSI_TOKEN_TYPE_DECLARATION: 1785 { 1786 const struct tgsi_full_declaration *d; 1787 unsigned last, first, mode; 1788 1789 d = &p.FullToken.FullDeclaration; 1790 first = d->DeclarationRange.First; 1791 last = d->DeclarationRange.Last; 1792 1793 switch (d->Declaration.File) { 1794 case TGSI_FILE_TEMPORARY: 1795 if (pc->temp_nr < (last + 1)) 1796 pc->temp_nr = last + 1; 1797 break; 1798 case TGSI_FILE_OUTPUT: 1799 if (pc->result_nr < (last + 1)) 1800 pc->result_nr = last + 1; 1801 1802 if (!d->Declaration.Semantic) 1803 break; 1804 1805 switch (d->Semantic.SemanticName) { 1806 case TGSI_SEMANTIC_POSITION: 1807 depr = first; 1808 pc->p->cfg.fp.regs[2] |= 0x00000100; 1809 pc->p->cfg.fp.regs[3] |= 0x00000011; 1810 break; 1811 default: 1812 break; 1813 } 1814 1815 break; 1816 case TGSI_FILE_INPUT: 1817 { 1818 if (pc->attr_nr < (last + 1)) 1819 pc->attr_nr = last + 1; 1820 1821 if (pc->p->type != PIPE_SHADER_FRAGMENT) 1822 break; 1823 1824 switch (d->Declaration.Interpolate) { 1825 case TGSI_INTERPOLATE_CONSTANT: 1826 mode = INTERP_FLAT; 1827 break; 1828 case TGSI_INTERPOLATE_PERSPECTIVE: 1829 mode = INTERP_PERSPECTIVE; 1830 break; 1831 default: 1832 mode = INTERP_LINEAR; 1833 break; 1834 } 1835 1836 if (d->Declaration.Semantic) { 1837 switch (d->Semantic.SemanticName) { 1838 case TGSI_SEMANTIC_POSITION: 1839 fcrd = first; 1840 break; 1841 case TGSI_SEMANTIC_COLOR: 1842 fcol = first; 1843 mode = INTERP_PERSPECTIVE; 1844 break; 1845 case TGSI_SEMANTIC_BCOLOR: 1846 bcol = first; 1847 mode = INTERP_PERSPECTIVE; 1848 break; 1849 } 1850 } 1851 1852 if (d->Declaration.Centroid) { 1853 mode |= INTERP_CENTROID; 1854 if (mode & INTERP_PERSPECTIVE) 1855 centroid_loads++; 1856 } else 1857 if (mode & INTERP_PERSPECTIVE) 1858 perspect_loads++; 1859 1860 assert(last < 32); 1861 for (i = first; i <= last; i++) 1862 pc->interp_mode[i] = mode; 1863 } 1864 break; 1865 case TGSI_FILE_CONSTANT: 1866 if (pc->param_nr < (last + 1)) 1867 pc->param_nr = last + 1; 1868 break; 1869 case TGSI_FILE_SAMPLER: 1870 break; 1871 default: 1872 NOUVEAU_ERR("bad decl file %d\n", 1873 d->Declaration.File); 1874 goto out_err; 1875 } 1876 } 1877 break; 1878 case TGSI_TOKEN_TYPE_INSTRUCTION: 1879 pc->insn_nr++; 1880 prep_inspect_insn(pc, tok, r_usage); 1881 break; 1882 default: 1883 break; 1884 } 1885 } 1886 1887 if (pc->temp_nr) { 1888 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg)); 1889 if (!pc->temp) 1890 goto out_err; 1891 1892 for (i = 0; i < pc->temp_nr; i++) { 1893 for (c = 0; c < 4; c++) { 1894 pc->temp[i*4+c].type = P_TEMP; 1895 pc->temp[i*4+c].hw = -1; 1896 pc->temp[i*4+c].rhw = -1; 1897 pc->temp[i*4+c].index = i; 1898 pc->temp[i*4+c].acc = r_usage[0][i*4+c]; 1899 } 1900 } 1901 } 1902 1903 if (pc->attr_nr) { 1904 int oid = 4, mid = 4, aid = 0; 1905 /* oid = VP output id 1906 * aid = FP attribute/interpolant id 1907 * mid = VP output mapping field ID 1908 */ 1909 1910 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); 1911 if (!pc->attr) 1912 goto out_err; 1913 1914 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 1915 /* position should be loaded first */ 1916 if (fcrd != 0xffff) { 1917 unsigned mask; 1918 mid = 0; 1919 mask = load_fp_attrib(pc, fcrd, r_usage[1], 1920 &mid, &aid, &oid); 1921 oid = 0; 1922 pc->p->cfg.fp.regs[1] |= (mask << 24); 1923 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd; 1924 } 1925 pc->p->cfg.fp.map[0] += 0x03020100; 1926 1927 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */ 1928 1929 if (perspect_loads) { 1930 pc->iv_p = alloc_temp(pc, NULL); 1931 1932 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) { 1933 pc->p->cfg.fp.regs[1] |= 0x08000000; 1934 pc->iv_p->rhw = aid++; 1935 emit_interp(pc, pc->iv_p, NULL, 1936 INTERP_LINEAR); 1937 emit_flop(pc, 0, pc->iv_p, pc->iv_p); 1938 } else { 1939 pc->iv_p->rhw = aid - 1; 1940 emit_flop(pc, 0, pc->iv_p, 1941 &pc->attr[fcrd * 4 + 3]); 1942 } 1943 } 1944 1945 if (centroid_loads) { 1946 pc->iv_c = alloc_temp(pc, NULL); 1947 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++; 1948 emit_interp(pc, pc->iv_c, NULL, 1949 INTERP_CENTROID); 1950 emit_flop(pc, 0, pc->iv_c, pc->iv_c); 1951 pc->p->cfg.fp.regs[1] |= 0x08000000; 1952 } 1953 1954 for (c = 0; c < 4; c++) { 1955 /* I don't know what these values do, but 1956 * let's set them like the blob does: 1957 */ 1958 if (fcol != 0xffff && r_usage[1][fcol * 4 + c]) 1959 pc->p->cfg.fp.regs[0] += 0x00010000; 1960 if (bcol != 0xffff && r_usage[1][bcol * 4 + c]) 1961 pc->p->cfg.fp.regs[0] += 0x00010000; 1962 } 1963 1964 for (i = 0; i < pc->attr_nr; i++) 1965 load_fp_attrib(pc, i, r_usage[1], 1966 &mid, &aid, &oid); 1967 1968 if (pc->iv_p) 1969 free_temp(pc, pc->iv_p); 1970 if (pc->iv_c) 1971 free_temp(pc, pc->iv_c); 1972 1973 pc->p->cfg.fp.high_map = (mid / 4); 1974 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0); 1975 } else { 1976 /* vertex program */ 1977 for (i = 0; i < pc->attr_nr * 4; i++) { 1978 pc->p->cfg.vp.attr[aid / 32] |= 1979 (1 << (aid % 32)); 1980 pc->attr[i].type = P_ATTR; 1981 pc->attr[i].hw = aid++; 1982 pc->attr[i].index = i / 4; 1983 } 1984 } 1985 } 1986 1987 if (pc->result_nr) { 1988 int rid = 0; 1989 1990 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg)); 1991 if (!pc->result) 1992 goto out_err; 1993 1994 for (i = 0; i < pc->result_nr; i++) { 1995 for (c = 0; c < 4; c++) { 1996 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 1997 pc->result[i*4+c].type = P_TEMP; 1998 pc->result[i*4+c].hw = -1; 1999 pc->result[i*4+c].rhw = (i == depr) ? 2000 -1 : rid++; 2001 } else { 2002 pc->result[i*4+c].type = P_RESULT; 2003 pc->result[i*4+c].hw = rid++; 2004 } 2005 pc->result[i*4+c].index = i; 2006 } 2007 2008 if (pc->p->type == PIPE_SHADER_FRAGMENT && 2009 depr != 0xffff) { 2010 pc->result[depr * 4 + 2].rhw = 2011 (pc->result_nr - 1) * 4; 2012 } 2013 } 2014 } 2015 2016 if (pc->param_nr) { 2017 int rid = 0; 2018 2019 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg)); 2020 if (!pc->param) 2021 goto out_err; 2022 2023 for (i = 0; i < pc->param_nr; i++) { 2024 for (c = 0; c < 4; c++) { 2025 pc->param[i*4+c].type = P_CONST; 2026 pc->param[i*4+c].hw = rid++; 2027 pc->param[i*4+c].index = i; 2028 } 2029 } 2030 } 2031 2032 if (pc->immd_nr) { 2033 int rid = 0; 2034 2035 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg)); 2036 if (!pc->immd) 2037 goto out_err; 2038 2039 for (i = 0; i < pc->immd_nr; i++) { 2040 for (c = 0; c < 4; c++) { 2041 pc->immd[i*4+c].type = P_IMMD; 2042 pc->immd[i*4+c].hw = rid++; 2043 pc->immd[i*4+c].index = i; 2044 } 2045 } 2046 } 2047 2048 ret = TRUE; 2049out_err: 2050 if (r_usage[0]) 2051 FREE(r_usage[0]); 2052 if (r_usage[1]) 2053 FREE(r_usage[1]); 2054 2055 tgsi_parse_free(&p); 2056 return ret; 2057} 2058 2059static void 2060free_nv50_pc(struct nv50_pc *pc) 2061{ 2062 if (pc->immd) 2063 FREE(pc->immd); 2064 if (pc->param) 2065 FREE(pc->param); 2066 if (pc->result) 2067 FREE(pc->result); 2068 if (pc->attr) 2069 FREE(pc->attr); 2070 if (pc->temp) 2071 FREE(pc->temp); 2072 2073 FREE(pc); 2074} 2075 2076static boolean 2077nv50_program_tx(struct nv50_program *p) 2078{ 2079 struct tgsi_parse_context parse; 2080 struct nv50_pc *pc; 2081 unsigned k; 2082 boolean ret; 2083 2084 pc = CALLOC_STRUCT(nv50_pc); 2085 if (!pc) 2086 return FALSE; 2087 pc->p = p; 2088 pc->p->cfg.high_temp = 4; 2089 2090 ret = nv50_program_tx_prep(pc); 2091 if (ret == FALSE) 2092 goto out_cleanup; 2093 2094 tgsi_parse_init(&parse, pc->p->pipe.tokens); 2095 while (!tgsi_parse_end_of_tokens(&parse)) { 2096 const union tgsi_full_token *tok = &parse.FullToken; 2097 2098 /* don't allow half insn/immd on first and last instruction */ 2099 pc->allow32 = TRUE; 2100 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr) 2101 pc->allow32 = FALSE; 2102 2103 tgsi_parse_token(&parse); 2104 2105 switch (tok->Token.Type) { 2106 case TGSI_TOKEN_TYPE_INSTRUCTION: 2107 ++pc->insn_cur; 2108 ret = nv50_program_tx_insn(pc, tok); 2109 if (ret == FALSE) 2110 goto out_err; 2111 break; 2112 default: 2113 break; 2114 } 2115 } 2116 2117 if (p->type == PIPE_SHADER_FRAGMENT) { 2118 struct nv50_reg out; 2119 2120 out.type = P_TEMP; 2121 for (k = 0; k < pc->result_nr * 4; k++) { 2122 if (pc->result[k].rhw == -1) 2123 continue; 2124 if (pc->result[k].hw != pc->result[k].rhw) { 2125 out.hw = pc->result[k].rhw; 2126 emit_mov(pc, &out, &pc->result[k]); 2127 } 2128 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1)) 2129 pc->p->cfg.high_result = pc->result[k].rhw + 1; 2130 } 2131 } 2132 2133 /* look for single half instructions and make them long */ 2134 struct nv50_program_exec *e, *e_prev; 2135 2136 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) { 2137 if (!is_long(e)) 2138 k++; 2139 2140 if (!e->next || is_long(e->next)) { 2141 if (k & 1) 2142 convert_to_long(pc, e); 2143 k = 0; 2144 } 2145 2146 if (e->next) 2147 e_prev = e; 2148 } 2149 2150 if (!is_long(pc->p->exec_tail)) { 2151 /* this may occur if moving FP results */ 2152 assert(e_prev && !is_long(e_prev)); 2153 convert_to_long(pc, e_prev); 2154 convert_to_long(pc, pc->p->exec_tail); 2155 } 2156 2157 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); 2158 pc->p->exec_tail->inst[1] |= 0x00000001; 2159 2160 p->param_nr = pc->param_nr * 4; 2161 p->immd_nr = pc->immd_nr * 4; 2162 p->immd = pc->immd_buf; 2163 2164out_err: 2165 tgsi_parse_free(&parse); 2166 2167out_cleanup: 2168 free_nv50_pc(pc); 2169 return ret; 2170} 2171 2172static void 2173nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) 2174{ 2175 if (nv50_program_tx(p) == FALSE) 2176 assert(0); 2177 p->translated = TRUE; 2178} 2179 2180static void 2181nv50_program_upload_data(struct nv50_context *nv50, float *map, 2182 unsigned start, unsigned count, unsigned cbuf) 2183{ 2184 struct nouveau_channel *chan = nv50->screen->nvws->channel; 2185 struct nouveau_grobj *tesla = nv50->screen->tesla; 2186 2187 while (count) { 2188 unsigned nr = count > 2047 ? 2047 : count; 2189 2190 BEGIN_RING(chan, tesla, 0x00000f00, 1); 2191 OUT_RING (chan, (cbuf << 0) | (start << 8)); 2192 BEGIN_RING(chan, tesla, 0x40000f04, nr); 2193 OUT_RINGp (chan, map, nr); 2194 2195 map += nr; 2196 start += nr; 2197 count -= nr; 2198 } 2199} 2200 2201static void 2202nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 2203{ 2204 struct nouveau_winsys *nvws = nv50->screen->nvws; 2205 struct pipe_winsys *ws = nv50->pipe.winsys; 2206 2207 if (!p->data[0] && p->immd_nr) { 2208 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 2209 2210 if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) { 2211 while (heap->next && heap->size < p->immd_nr) { 2212 struct nv50_program *evict = heap->next->priv; 2213 nvws->res_free(&evict->data[0]); 2214 } 2215 2216 if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) 2217 assert(0); 2218 } 2219 2220 /* immediates only need to be uploaded again when freed */ 2221 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 2222 p->immd_nr, NV50_CB_PMISC); 2223 } 2224 2225 if (!p->data[1] && p->param_nr) { 2226 struct nouveau_resource *heap = 2227 nv50->screen->parm_heap[p->type]; 2228 2229 if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) { 2230 while (heap->next && heap->size < p->param_nr) { 2231 struct nv50_program *evict = heap->next->priv; 2232 nvws->res_free(&evict->data[1]); 2233 } 2234 2235 if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) 2236 assert(0); 2237 } 2238 } 2239 2240 if (p->param_nr) { 2241 unsigned cbuf = NV50_CB_PVP; 2242 float *map = ws->buffer_map(ws, nv50->constbuf[p->type], 2243 PIPE_BUFFER_USAGE_CPU_READ); 2244 if (p->type == PIPE_SHADER_FRAGMENT) 2245 cbuf = NV50_CB_PFP; 2246 nv50_program_upload_data(nv50, map, p->data[1]->start, 2247 p->param_nr, cbuf); 2248 ws->buffer_unmap(ws, nv50->constbuf[p->type]); 2249 } 2250} 2251 2252static void 2253nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 2254{ 2255 struct nouveau_channel *chan = nv50->screen->nvws->channel; 2256 struct nouveau_grobj *tesla = nv50->screen->tesla; 2257 struct pipe_screen *screen = nv50->pipe.screen; 2258 struct nv50_program_exec *e; 2259 struct nouveau_stateobj *so; 2260 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; 2261 unsigned start, count, *up, *ptr; 2262 boolean upload = FALSE; 2263 2264 if (!p->buffer) { 2265 p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4); 2266 upload = TRUE; 2267 } 2268 2269 if ((p->data[0] && p->data[0]->start != p->data_start[0]) || 2270 (p->data[1] && p->data[1]->start != p->data_start[1])) { 2271 for (e = p->exec_head; e; e = e->next) { 2272 unsigned ei, ci, bs; 2273 2274 if (e->param.index < 0) 2275 continue; 2276 bs = (e->inst[1] >> 22) & 0x07; 2277 assert(bs < 2); 2278 ei = e->param.shift >> 5; 2279 ci = e->param.index + p->data[bs]->start; 2280 2281 e->inst[ei] &= ~e->param.mask; 2282 e->inst[ei] |= (ci << e->param.shift); 2283 } 2284 2285 if (p->data[0]) 2286 p->data_start[0] = p->data[0]->start; 2287 if (p->data[1]) 2288 p->data_start[1] = p->data[1]->start; 2289 2290 upload = TRUE; 2291 } 2292 2293 if (!upload) 2294 return; 2295 2296#ifdef NV50_PROGRAM_DUMP 2297 NOUVEAU_ERR("-------\n"); 2298 for (e = p->exec_head; e; e = e->next) { 2299 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 2300 if (is_long(e)) 2301 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 2302 } 2303#endif 2304 2305 up = ptr = MALLOC(p->exec_size * 4); 2306 for (e = p->exec_head; e; e = e->next) { 2307 *(ptr++) = e->inst[0]; 2308 if (is_long(e)) 2309 *(ptr++) = e->inst[1]; 2310 } 2311 2312 so = so_new(4,2); 2313 so_method(so, nv50->screen->tesla, 0x1280, 3); 2314 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0); 2315 so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0); 2316 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); 2317 2318 start = 0; count = p->exec_size; 2319 while (count) { 2320 struct nouveau_winsys *nvws = nv50->screen->nvws; 2321 unsigned nr; 2322 2323 so_emit(nvws, so); 2324 2325 nr = MIN2(count, 2047); 2326 nr = MIN2(nvws->channel->pushbuf->remaining, nr); 2327 if (nvws->channel->pushbuf->remaining < (nr + 3)) { 2328 FIRE_RING(chan); 2329 continue; 2330 } 2331 2332 BEGIN_RING(chan, tesla, 0x0f00, 1); 2333 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); 2334 BEGIN_RING(chan, tesla, 0x40000f04, nr); 2335 OUT_RINGp (chan, up + start, nr); 2336 2337 start += nr; 2338 count -= nr; 2339 } 2340 2341 FREE(up); 2342 so_ref(NULL, &so); 2343} 2344 2345void 2346nv50_vertprog_validate(struct nv50_context *nv50) 2347{ 2348 struct nouveau_grobj *tesla = nv50->screen->tesla; 2349 struct nv50_program *p = nv50->vertprog; 2350 struct nouveau_stateobj *so; 2351 2352 if (!p->translated) { 2353 nv50_program_validate(nv50, p); 2354 if (!p->translated) 2355 assert(0); 2356 } 2357 2358 nv50_program_validate_data(nv50, p); 2359 nv50_program_validate_code(nv50, p); 2360 2361 so = so_new(13, 2); 2362 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 2363 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2364 NOUVEAU_BO_HIGH, 0, 0); 2365 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2366 NOUVEAU_BO_LOW, 0, 0); 2367 so_method(so, tesla, 0x1650, 2); 2368 so_data (so, p->cfg.vp.attr[0]); 2369 so_data (so, p->cfg.vp.attr[1]); 2370 so_method(so, tesla, 0x16b8, 1); 2371 so_data (so, p->cfg.high_result); 2372 so_method(so, tesla, 0x16ac, 2); 2373 so_data (so, p->cfg.high_result); //8); 2374 so_data (so, p->cfg.high_temp); 2375 so_method(so, tesla, 0x140c, 1); 2376 so_data (so, 0); /* program start offset */ 2377 so_ref(so, &nv50->state.vertprog); 2378 so_ref(NULL, &so); 2379} 2380 2381void 2382nv50_fragprog_validate(struct nv50_context *nv50) 2383{ 2384 struct nouveau_grobj *tesla = nv50->screen->tesla; 2385 struct nv50_program *p = nv50->fragprog; 2386 struct nouveau_stateobj *so; 2387 unsigned i; 2388 2389 if (!p->translated) { 2390 nv50_program_validate(nv50, p); 2391 if (!p->translated) 2392 assert(0); 2393 } 2394 2395 nv50_program_validate_data(nv50, p); 2396 nv50_program_validate_code(nv50, p); 2397 2398 so = so_new(64, 2); 2399 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 2400 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2401 NOUVEAU_BO_HIGH, 0, 0); 2402 so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2403 NOUVEAU_BO_LOW, 0, 0); 2404 so_method(so, tesla, 0x1904, 4); 2405 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */ 2406 so_data (so, 0x00000004); 2407 so_data (so, 0x00000000); 2408 so_data (so, 0x00000000); 2409 so_method(so, tesla, 0x16bc, p->cfg.fp.high_map); 2410 for (i = 0; i < p->cfg.fp.high_map; i++) 2411 so_data(so, p->cfg.fp.map[i]); 2412 so_method(so, tesla, 0x1988, 2); 2413 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */ 2414 so_data (so, p->cfg.high_temp); 2415 so_method(so, tesla, 0x1298, 1); 2416 so_data (so, p->cfg.high_result); 2417 so_method(so, tesla, 0x19a8, 1); 2418 so_data (so, p->cfg.fp.regs[2]); 2419 so_method(so, tesla, 0x196c, 1); 2420 so_data (so, p->cfg.fp.regs[3]); 2421 so_method(so, tesla, 0x1414, 1); 2422 so_data (so, 0); /* program start offset */ 2423 so_ref(so, &nv50->state.fragprog); 2424 so_ref(NULL, &so); 2425} 2426 2427void 2428nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 2429{ 2430 struct pipe_screen *pscreen = nv50->pipe.screen; 2431 2432 while (p->exec_head) { 2433 struct nv50_program_exec *e = p->exec_head; 2434 2435 p->exec_head = e->next; 2436 FREE(e); 2437 } 2438 p->exec_tail = NULL; 2439 p->exec_size = 0; 2440 2441 if (p->buffer) 2442 pipe_buffer_reference(&p->buffer, NULL); 2443 2444 nv50->screen->nvws->res_free(&p->data[0]); 2445 nv50->screen->nvws->res_free(&p->data[1]); 2446 2447 p->translated = 0; 2448} 2449 2450