ir3_compiler.c revision ed48f91275f52f26b513fc2970233063bfa023af
1/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ 2 3/* 4 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 * SOFTWARE. 
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include <stdarg.h>

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_scan.h"

#include "freedreno_lowering.h"
#include "freedreno_util.h"

#include "ir3_compiler.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"

/* Per-compile state for translating one TGSI shader into ir3 IR. */
struct ir3_compile_context {
	const struct tgsi_token *tokens;
	bool free_tokens;  /* true if 'tokens' came from the lowering pass and must be freed */
	struct ir3 *ir;
	struct ir3_shader_variant *so;

	struct ir3_block *block;  /* current (innermost) block */
	struct ir3_instruction *current_instr;

	/* we need to defer updates to block->outputs[] until the end
	 * of an instruction (so we don't see new value until *after*
	 * the src registers are processed)
	 */
	struct {
		struct ir3_instruction *instr, **instrp;
	} output_updates[16];
	unsigned num_output_updates;

	/* are we in a sequence of "atomic" instructions?
	 */
	bool atomic;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * values that can be directly used for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	struct tgsi_parse_context parser;
	unsigned type;  /* TGSI_PROCESSOR_* of the shader being compiled */

	struct tgsi_shader_info info;

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	/* internal temporaries handed out by get_internal_temp(): */
	unsigned num_internal_temps;
	struct tgsi_src_register internal_temps[6];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	/* stack of branch instructions that mark (potentially nested)
	 * branch if/else/loop/etc
	 */
	struct {
		struct ir3_instruction *instr, *cond;
		bool inv;  /* true iff in else leg of branch */
	} branch[16];
	unsigned int branch_count;

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* used when dst is same as one of the src, to avoid overwriting a
	 * src element before the remaining scalar instructions that make
	 * up the vector operation
	 */
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	/* just for catching incorrect use of get_dst()/put_dst():
	 */
	bool using_tmp_dst;
};


static void vectorize(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
		int nsrcs, ...);
static void create_mov(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
static type_t get_ftype(struct ir3_compile_context *ctx);

/* Initialize the compile context for one shader variant: run the TGSI
 * lowering passes, reset all per-shader state, and set up the token
 * parser.  Returns TGSI_PARSE_OK on success, an error code otherwise.
 */
static unsigned
compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
		const struct tgsi_token *tokens)
{
	unsigned ret;
	struct tgsi_shader_info *info = &ctx->info;
	/* opcodes the a3xx backend wants lowered to simpler sequences: */
	const struct fd_lowering_config lconfig = {
			.color_two_side = so->key.color_two_side,
			.lower_DST = true,
			.lower_XPD = true,
			.lower_SCS = true,
			.lower_LRP = true,
			.lower_FRC = true,
			.lower_POW = true,
			.lower_LIT = true,
			.lower_EXP = true,
			.lower_LOG = true,
			.lower_DP4 = true,
			.lower_DP3 = true,
			.lower_DPH = true,
			.lower_DP2 = true,
			.lower_DP2A = true,
	};

	/* lowering returns NULL when nothing needed lowering; in that case
	 * keep using the caller's tokens (and don't free them later):
	 */
	ctx->tokens = fd_transform_lowering(&lconfig, tokens, &ctx->info);
	ctx->free_tokens = !!ctx->tokens;
	if (!ctx->tokens) {
		/* no lowering */
		ctx->tokens = tokens;
	}
	ctx->ir = so->ir;
	ctx->so = so;
	ctx->next_inloc = 8;
	ctx->num_internal_temps = 0;
	ctx->branch_count = 0;
	ctx->kill_count = 0;
	ctx->block = NULL;
	ctx->current_instr = NULL;
	ctx->num_output_updates = 0;
	ctx->atomic = false;
	ctx->frag_pos = NULL;
	ctx->frag_face = NULL;
	ctx->tmp_src = NULL;
	ctx->using_tmp_dst = false;

	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));

#define FM(x) (1 << TGSI_FILE_##x)
	/* optimize can't deal with relative addressing: */
	if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
		return TGSI_PARSE_ERROR;

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	if (info->indirect_files & FM(CONSTANT))
		so->constlen = 4 * (ctx->info.file_max[TGSI_FILE_CONSTANT] + 1);

	/* Immediates go after constants: */
	so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);

	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;

	return ret;
}

/* Report a fatal compile problem: print the formatted message, dump the
 * TGSI being compiled for context, then assert (debug builds).
 */
static void
compile_error(struct ir3_compile_context *ctx, const char *format, ...)
{
	va_list ap;
	va_start(ap, format);
	_debug_vprintf(format, ap);
	va_end(ap);
	tgsi_dump(ctx->tokens, 0);
	debug_assert(0);
}

#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)

/* Release the lowered token buffer (if we own it) and parser state. */
static void
compile_free(struct ir3_compile_context *ctx)
{
	if (ctx->free_tokens)
		free((void *)ctx->tokens);
	tgsi_parse_free(&ctx->parser);
}

/* Table entry mapping one TGSI opcode to its translation handler. */
struct instr_translater {
	void (*fxn)(const struct instr_translater *t,
			struct ir3_compile_context *ctx,
			struct tgsi_full_instruction *inst);
	unsigned tgsi_opc;
	opc_t opc;
	opc_t hopc;    /* opc to use for half_precision mode, if different */
	unsigned arg;
};

/* Flush deferred writes to block temporaries/outputs/address (queued by
 * ssa_dst()) -- skipped while inside an atomic group so subsequent scalar
 * instructions of the same vector op still read the pre-op values.
 */
static void
instr_finish(struct ir3_compile_context *ctx)
{
	unsigned i;

	if (ctx->atomic)
		return;

	for (i = 0; i < ctx->num_output_updates; i++)
		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;

	ctx->num_output_updates = 0;
}

/* For "atomic" groups of instructions, for example the four scalar
 * instructions to perform a vec4 operation.  Basically this just
 * blocks out handling of output_updates so the next scalar instruction
 * still sees the result from before the start of the atomic group.
 *
 * NOTE: when used properly, this could probably replace get/put_dst()
 * stuff.
258 */ 259static void 260instr_atomic_start(struct ir3_compile_context *ctx) 261{ 262 ctx->atomic = true; 263} 264 265static void 266instr_atomic_end(struct ir3_compile_context *ctx) 267{ 268 ctx->atomic = false; 269 instr_finish(ctx); 270} 271 272static struct ir3_instruction * 273instr_create(struct ir3_compile_context *ctx, int category, opc_t opc) 274{ 275 instr_finish(ctx); 276 return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc)); 277} 278 279static struct ir3_instruction * 280instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr) 281{ 282 instr_finish(ctx); 283 return (ctx->current_instr = ir3_instr_clone(instr)); 284} 285 286static struct ir3_block * 287push_block(struct ir3_compile_context *ctx) 288{ 289 struct ir3_block *block; 290 unsigned ntmp, nin, nout; 291 292#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1)) 293 294 /* hmm, give ourselves room to create 4 extra temporaries (vec4): 295 */ 296 ntmp = SCALAR_REGS(TEMPORARY); 297 ntmp += 4 * 4; 298 299 nout = SCALAR_REGS(OUTPUT); 300 nin = SCALAR_REGS(INPUT); 301 302 /* for outermost block, 'inputs' are the actual shader INPUT 303 * register file. Reads from INPUT registers always go back to 304 * top block. For nested blocks, 'inputs' is used to track any 305 * TEMPORARY file register from one of the enclosing blocks that 306 * is ready in this block. 
307 */ 308 if (!ctx->block) { 309 /* NOTE: fragment shaders actually have two inputs (r0.xy, the 310 * position) 311 */ 312 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { 313 int n = 2; 314 if (ctx->info.reads_position) 315 n += 4; 316 if (ctx->info.uses_frontface) 317 n += 4; 318 nin = MAX2(n, nin); 319 nout += ARRAY_SIZE(ctx->kill); 320 } 321 } else { 322 nin = ntmp; 323 } 324 325 block = ir3_block_create(ctx->ir, ntmp, nin, nout); 326 327 if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block) 328 block->noutputs -= ARRAY_SIZE(ctx->kill); 329 330 block->parent = ctx->block; 331 ctx->block = block; 332 333 return block; 334} 335 336static void 337pop_block(struct ir3_compile_context *ctx) 338{ 339 ctx->block = ctx->block->parent; 340 compile_assert(ctx, ctx->block); 341} 342 343static struct ir3_instruction * 344create_output(struct ir3_block *block, struct ir3_instruction *instr, 345 unsigned n) 346{ 347 struct ir3_instruction *out; 348 349 out = ir3_instr_create(block, -1, OPC_META_OUTPUT); 350 out->inout.block = block; 351 ir3_reg_create(out, n, 0); 352 if (instr) 353 ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr; 354 355 return out; 356} 357 358static struct ir3_instruction * 359create_input(struct ir3_block *block, struct ir3_instruction *instr, 360 unsigned n) 361{ 362 struct ir3_instruction *in; 363 364 in = ir3_instr_create(block, -1, OPC_META_INPUT); 365 in->inout.block = block; 366 ir3_reg_create(in, n, 0); 367 if (instr) 368 ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; 369 370 return in; 371} 372 373static struct ir3_instruction * 374block_input(struct ir3_block *block, unsigned n) 375{ 376 /* references to INPUT register file always go back up to 377 * top level: 378 */ 379 if (block->parent) 380 return block_input(block->parent, n); 381 return block->inputs[n]; 382} 383 384/* return temporary in scope, creating if needed meta-input node 385 * to track block inputs 386 */ 387static struct ir3_instruction * 388block_temporary(struct 
ir3_block *block, unsigned n) 389{ 390 /* references to TEMPORARY register file, find the nearest 391 * enclosing block which has already assigned this temporary, 392 * creating meta-input instructions along the way to keep 393 * track of block inputs 394 */ 395 if (block->parent && !block->temporaries[n]) { 396 /* if already have input for this block, reuse: */ 397 if (!block->inputs[n]) 398 block->inputs[n] = block_temporary(block->parent, n); 399 400 /* and create new input to return: */ 401 return create_input(block, block->inputs[n], n); 402 } 403 return block->temporaries[n]; 404} 405 406static struct ir3_instruction * 407create_immed(struct ir3_compile_context *ctx, float val) 408{ 409 /* NOTE: *don't* use instr_create() here! 410 */ 411 struct ir3_instruction *instr; 412 instr = ir3_instr_create(ctx->block, 1, 0); 413 instr->cat1.src_type = get_ftype(ctx); 414 instr->cat1.dst_type = get_ftype(ctx); 415 ir3_reg_create(instr, 0, 0); 416 ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val; 417 return instr; 418} 419 420static void 421ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr, 422 const struct tgsi_dst_register *dst, unsigned chan) 423{ 424 unsigned n = regid(dst->Index, chan); 425 unsigned idx = ctx->num_output_updates; 426 427 compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates)); 428 429 /* NOTE: defer update of temporaries[idx] or output[idx] 430 * until instr_finish(), so that if the current instruction 431 * reads the same TEMP/OUT[] it gets the old value: 432 * 433 * bleh.. this might be a bit easier to just figure out 434 * in instr_finish(). But at that point we've already 435 * lost information about OUTPUT vs TEMPORARY register 436 * file.. 
437 */ 438 439 switch (dst->File) { 440 case TGSI_FILE_OUTPUT: 441 compile_assert(ctx, n < ctx->block->noutputs); 442 ctx->output_updates[idx].instrp = &ctx->block->outputs[n]; 443 ctx->output_updates[idx].instr = instr; 444 ctx->num_output_updates++; 445 break; 446 case TGSI_FILE_TEMPORARY: 447 compile_assert(ctx, n < ctx->block->ntemporaries); 448 ctx->output_updates[idx].instrp = &ctx->block->temporaries[n]; 449 ctx->output_updates[idx].instr = instr; 450 ctx->num_output_updates++; 451 break; 452 case TGSI_FILE_ADDRESS: 453 compile_assert(ctx, n < 1); 454 ctx->output_updates[idx].instrp = &ctx->block->address; 455 ctx->output_updates[idx].instr = instr; 456 ctx->num_output_updates++; 457 break; 458 } 459} 460 461static void 462ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg, 463 const struct tgsi_src_register *src, unsigned chan) 464{ 465 struct ir3_block *block = ctx->block; 466 unsigned n = regid(src->Index, chan); 467 468 switch (src->File) { 469 case TGSI_FILE_INPUT: 470 reg->flags |= IR3_REG_SSA; 471 reg->instr = block_input(ctx->block, n); 472 break; 473 case TGSI_FILE_OUTPUT: 474 /* really this should just happen in case of 'MOV_SAT OUT[n], ..', 475 * for the following clamp instructions: 476 */ 477 reg->flags |= IR3_REG_SSA; 478 reg->instr = block->outputs[n]; 479 /* we don't have to worry about read from an OUTPUT that was 480 * assigned outside of the current block, because the _SAT 481 * clamp instructions will always be in the same block as 482 * the original instruction which wrote the OUTPUT 483 */ 484 compile_assert(ctx, reg->instr); 485 break; 486 case TGSI_FILE_TEMPORARY: 487 reg->flags |= IR3_REG_SSA; 488 reg->instr = block_temporary(ctx->block, n); 489 break; 490 } 491 492 if ((reg->flags & IR3_REG_SSA) && !reg->instr) { 493 /* this can happen when registers (or components of a TGSI 494 * register) are used as src before they have been assigned 495 * (undefined contents). 
To avoid confusing the rest of the 496 * compiler, and to generally keep things peachy, substitute 497 * an instruction that sets the src to 0.0. Or to keep 498 * things undefined, I could plug in a random number? :-P 499 * 500 * NOTE: *don't* use instr_create() here! 501 */ 502 reg->instr = create_immed(ctx, 0.0); 503 } 504} 505 506static struct ir3_register * 507add_dst_reg_wrmask(struct ir3_compile_context *ctx, 508 struct ir3_instruction *instr, const struct tgsi_dst_register *dst, 509 unsigned chan, unsigned wrmask) 510{ 511 unsigned flags = 0, num = 0; 512 struct ir3_register *reg; 513 514 switch (dst->File) { 515 case TGSI_FILE_OUTPUT: 516 case TGSI_FILE_TEMPORARY: 517 /* uses SSA */ 518 break; 519 case TGSI_FILE_ADDRESS: 520 flags |= IR3_REG_ADDR; 521 /* uses SSA */ 522 break; 523 default: 524 compile_error(ctx, "unsupported dst register file: %s\n", 525 tgsi_file_name(dst->File)); 526 break; 527 } 528 529 if (dst->Indirect) 530 flags |= IR3_REG_RELATIV; 531 532 reg = ir3_reg_create(instr, regid(num, chan), flags); 533 534 /* NOTE: do not call ssa_dst() if atomic.. vectorize() 535 * itself will call ssa_dst(). This is to filter out 536 * the (initially bogus) .x component dst which is 537 * created (but not necessarily used, ie. 
if the net 538 * result of the vector operation does not write to 539 * the .x component) 540 */ 541 542 reg->wrmask = wrmask; 543 if (wrmask == 0x1) { 544 /* normal case */ 545 if (!ctx->atomic) 546 ssa_dst(ctx, instr, dst, chan); 547 } else if ((dst->File == TGSI_FILE_TEMPORARY) || 548 (dst->File == TGSI_FILE_OUTPUT) || 549 (dst->File == TGSI_FILE_ADDRESS)) { 550 unsigned i; 551 552 /* if instruction writes multiple, we need to create 553 * some place-holder collect the registers: 554 */ 555 for (i = 0; i < 4; i++) { 556 if (wrmask & (1 << i)) { 557 struct ir3_instruction *collect = 558 ir3_instr_create(ctx->block, -1, OPC_META_FO); 559 collect->fo.off = i; 560 /* unused dst reg: */ 561 ir3_reg_create(collect, 0, 0); 562 /* and src reg used to hold original instr */ 563 ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr; 564 if (!ctx->atomic) 565 ssa_dst(ctx, collect, dst, chan+i); 566 } 567 } 568 } 569 570 return reg; 571} 572 573static struct ir3_register * 574add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, 575 const struct tgsi_dst_register *dst, unsigned chan) 576{ 577 return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1); 578} 579 580static struct ir3_register * 581add_src_reg_wrmask(struct ir3_compile_context *ctx, 582 struct ir3_instruction *instr, const struct tgsi_src_register *src, 583 unsigned chan, unsigned wrmask) 584{ 585 unsigned flags = 0, num = 0; 586 struct ir3_register *reg; 587 struct ir3_instruction *orig = NULL; 588 589 /* TODO we need to use a mov to temp for const >= 64.. or maybe 590 * we could use relative addressing.. 591 */ 592 compile_assert(ctx, src->Index < 64); 593 594 switch (src->File) { 595 case TGSI_FILE_IMMEDIATE: 596 /* TODO if possible, use actual immediate instead of const.. but 597 * TGSI has vec4 immediates, we can only embed scalar (of limited 598 * size, depending on instruction..) 
599 */ 600 flags |= IR3_REG_CONST; 601 num = src->Index + ctx->so->first_immediate; 602 break; 603 case TGSI_FILE_CONSTANT: 604 flags |= IR3_REG_CONST; 605 num = src->Index; 606 break; 607 case TGSI_FILE_OUTPUT: 608 /* NOTE: we should only end up w/ OUTPUT file for things like 609 * clamp()'ing saturated dst instructions 610 */ 611 case TGSI_FILE_INPUT: 612 case TGSI_FILE_TEMPORARY: 613 /* uses SSA */ 614 break; 615 default: 616 compile_error(ctx, "unsupported src register file: %s\n", 617 tgsi_file_name(src->File)); 618 break; 619 } 620 621 if (src->Absolute) 622 flags |= IR3_REG_ABS; 623 if (src->Negate) 624 flags |= IR3_REG_NEGATE; 625 626 if (src->Indirect) { 627 flags |= IR3_REG_RELATIV; 628 629 /* shouldn't happen, and we can't cope with it below: */ 630 compile_assert(ctx, wrmask == 0x1); 631 632 /* wrap in a meta-deref to track both the src and address: */ 633 orig = instr; 634 635 instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF); 636 ir3_reg_create(instr, 0, 0); 637 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address; 638 } 639 640 reg = ir3_reg_create(instr, regid(num, chan), flags); 641 642 reg->wrmask = wrmask; 643 if (wrmask == 0x1) { 644 /* normal case */ 645 ssa_src(ctx, reg, src, chan); 646 } else if ((src->File == TGSI_FILE_TEMPORARY) || 647 (src->File == TGSI_FILE_OUTPUT) || 648 (src->File == TGSI_FILE_INPUT)) { 649 struct ir3_instruction *collect; 650 unsigned i; 651 652 compile_assert(ctx, !src->Indirect); 653 654 /* if instruction reads multiple, we need to create 655 * some place-holder collect the registers: 656 */ 657 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI); 658 ir3_reg_create(collect, 0, 0); /* unused dst reg */ 659 660 for (i = 0; i < 4; i++) { 661 if (wrmask & (1 << i)) { 662 /* and src reg used point to the original instr */ 663 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), 664 src, chan + i); 665 } else if (wrmask & ~((i << i) - 1)) { 666 /* if any remaining components, then dummy 667 * 
placeholder src reg to fill in the blanks: 668 */ 669 ir3_reg_create(collect, 0, 0); 670 } 671 } 672 673 reg->flags |= IR3_REG_SSA; 674 reg->instr = collect; 675 } 676 677 if (src->Indirect) { 678 reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA); 679 reg->instr = instr; 680 } 681 return reg; 682} 683 684static struct ir3_register * 685add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, 686 const struct tgsi_src_register *src, unsigned chan) 687{ 688 return add_src_reg_wrmask(ctx, instr, src, chan, 0x1); 689} 690 691static void 692src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) 693{ 694 src->File = dst->File; 695 src->Indirect = dst->Indirect; 696 src->Dimension = dst->Dimension; 697 src->Index = dst->Index; 698 src->Absolute = 0; 699 src->Negate = 0; 700 src->SwizzleX = TGSI_SWIZZLE_X; 701 src->SwizzleY = TGSI_SWIZZLE_Y; 702 src->SwizzleZ = TGSI_SWIZZLE_Z; 703 src->SwizzleW = TGSI_SWIZZLE_W; 704} 705 706/* Get internal-temp src/dst to use for a sequence of instructions 707 * generated by a single TGSI op. 
 */
static struct tgsi_src_register *
get_internal_temp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst)
{
	struct tgsi_src_register *tmp_src;
	int n;

	tmp_dst->File      = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect  = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
	tmp_src = &ctx->internal_temps[n];

	/* internal temps live just past the shader's declared TEMPORARY range: */
	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;

	src_from_dst(tmp_src, tmp_dst);

	return tmp_src;
}

static inline bool
is_const(struct tgsi_src_register *src)
{
	return (src->File == TGSI_FILE_CONSTANT) ||
			(src->File == TGSI_FILE_IMMEDIATE);
}

static inline bool
is_relative(struct tgsi_src_register *src)
{
	return src->Indirect;
}

static inline bool
is_rel_or_const(struct tgsi_src_register *src)
{
	return is_relative(src) || is_const(src);
}

static type_t
get_ftype(struct ir3_compile_context *ctx)
{
	return TYPE_F32;
}

static type_t
get_utype(struct ir3_compile_context *ctx)
{
	return TYPE_U32;
}

/* Return the TGSI swizzle selector of 'src' for channel 'chan' (0..3). */
static unsigned
src_swiz(struct tgsi_src_register *src, int chan)
{
	switch (chan) {
	case 0: return src->SwizzleX;
	case 1: return src->SwizzleY;
	case 2: return src->SwizzleZ;
	case 3: return src->SwizzleW;
	}
	assert(0);
	return 0;
}

/* for instructions that cannot take a const register as src, if needed
 * generate a move to temporary gpr:
 */
static struct tgsi_src_register *
get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	compile_assert(ctx, is_rel_or_const(src));

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	create_mov(ctx, &tmp_dst, src);

	return tmp_src;
}

/* Find (or allocate) a slot in the shader's immediate table holding 'val'
 * (raw 32-bit value), and fill 'reg' to reference it.  Reuses an existing
 * slot matching val, or -val with the negate flag set.
 */
static void
get_immediate(struct ir3_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx  = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		/* NOTE(review): unsigned negation of raw bits -- presumably the
		 * callers only pass values for which this matches the intended
		 * negate semantics; verify against get_immediate() call sites.
		 */
		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx  = i / 4;
		neg  = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	reg->File      = TGSI_FILE_IMMEDIATE;
	reg->Indirect  = 0;
	reg->Dimension = 0;
	reg->Index     = idx;
	reg->Absolute  = 0;
	reg->Negate    = neg;
	reg->SwizzleX  = swiz2tgsi[swiz];
	reg->SwizzleY  = swiz2tgsi[swiz];
	reg->SwizzleZ  = swiz2tgsi[swiz];
	reg->SwizzleW  = swiz2tgsi[swiz];
}

/* Emit one scalar mov per written dst component, copying src (with its
 * swizzle) into dst.  abs/neg modifiers are handled via absneg.f since a
 * plain mov cannot carry them.
 */
static void
create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	type_t type_mov = get_ftype(ctx);
	unsigned i;

	for (i = 0; i < 4; i++) {
		/* move to destination: */
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *instr;

			if (src->Absolute || src->Negate) {
				/* can't have abs or neg on a mov instr, so use
				 * absneg.f instead to handle these cases:
				 */
				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
			} else {
				instr = instr_create(ctx, 1, 0);
				instr->cat1.src_type = type_mov;
				instr->cat1.dst_type = type_mov;
			}

			add_dst_reg(ctx, instr, dst, i);
			add_src_reg(ctx, instr, src, src_swiz(src, i));
		}
	}
}

/* dst = clamp(val, minval, maxval), emitted as max.f followed by min.f. */
static void
create_clamp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
{
	struct ir3_instruction *instr;

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);

	instr = instr_create(ctx, 2, OPC_MIN_F);
	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
}

/* Clamp dst in place between two raw immediate values. */
static void
create_clamp_imm(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst,
		uint32_t minval, uint32_t maxval)
{
	struct tgsi_src_register minconst, maxconst;
	struct tgsi_src_register src;

	src_from_dst(&src, dst);

	get_immediate(ctx, &minconst, minval);
	get_immediate(ctx, &maxconst, maxval);

	create_clamp(ctx, dst, &src, &minconst, &maxconst);
}

/* Return the dst register for 'inst', substituting an internal temporary
 * if dst aliases one of the srcs (so scalarized instructions don't clobber
 * src components still to be read).  Must be paired with put_dst().
 */
static struct tgsi_dst_register *
get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	unsigned i;

	compile_assert(ctx, !ctx->using_tmp_dst);
	ctx->using_tmp_dst = true;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_src_register *src = &inst->Src[i].Register;
		if ((src->File == dst->File) && (src->Index == dst->Index)) {
			/* full-width identity-swizzle overlap is harmless: */
			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
					(src->SwizzleX == TGSI_SWIZZLE_X) &&
					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
					(src->SwizzleW == TGSI_SWIZZLE_W))
				continue;
			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
			ctx->tmp_dst.WriteMask = dst->WriteMask;
			dst = &ctx->tmp_dst;
			break;
		}
	}
	return dst;
}

/* Finish a get_dst() pairing: copy the temporary back to the real dst if
 * one was substituted.
 */
static void
put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
		struct tgsi_dst_register *dst)
{
	compile_assert(ctx, ctx->using_tmp_dst);
	ctx->using_tmp_dst = false;

	/* if necessary, add mov back into original dst: */
	if (dst != &inst->Dst[0].Register) {
		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
	}
}

/* helper to generate the necessary repeat and/or additional instructions
 * to turn a scalar instruction into a vector operation:
 */
static void
vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		struct tgsi_dst_register *dst, int nsrcs, ...)
{
	va_list ap;
	int i, j, n = 0;

	/* the scalar clones form one atomic group so they all read the
	 * pre-instruction values of any register that is also written:
	 */
	instr_atomic_start(ctx);

	add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);

	/* first pass: build the .x template instruction's src regs.
	 * varargs are (src, flags) pairs; IR3_REG_IMMED flags mean the
	 * "src" pointer actually carries an integer immediate value.
	 */
	va_start(ap, nsrcs);
	for (j = 0; j < nsrcs; j++) {
		struct tgsi_src_register *src =
				va_arg(ap, struct tgsi_src_register *);
		unsigned flags = va_arg(ap, unsigned);
		struct ir3_register *reg;
		if (flags & IR3_REG_IMMED) {
			reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
			/* this is an ugly cast.. should have put flags first! */
			reg->iim_val = *(int *)&src;
		} else {
			reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
		}
		reg->flags |= flags & ~IR3_REG_NEGATE;
		/* XOR so a negate flag can cancel an already-negated src: */
		if (flags & IR3_REG_NEGATE)
			reg->flags ^= IR3_REG_NEGATE;
	}
	va_end(ap);

	/* second pass: clone the template for each written component,
	 * fixing up dst/src components per the writemask and swizzles:
	 */
	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *cur;

			if (n++ == 0) {
				cur = instr;
			} else {
				cur = instr_clone(ctx, instr);
			}

			ssa_dst(ctx, cur, dst, i);

			/* fix-up dst register component: */
			cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);

			/* fix-up src register component: */
			va_start(ap, nsrcs);
			for (j = 0; j < nsrcs; j++) {
				struct ir3_register *reg = cur->regs[j+1];
				struct tgsi_src_register *src =
						va_arg(ap, struct tgsi_src_register *);
				unsigned flags = va_arg(ap, unsigned);
				if (reg->flags & IR3_REG_SSA) {
					ssa_src(ctx, reg, src, src_swiz(src, i));
				} else if (!(flags & IR3_REG_IMMED)) {
					reg->num = regid(reg->num >> 2, src_swiz(src, i));
				}
			}
			va_end(ap);
		}
	}

	instr_atomic_end(ctx);
}

/*
 * Handlers for TGSI instructions which do not have a 1:1 mapping to
 * native instructions:
 */

static void
trans_clamp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct tgsi_src_register *src2 = &inst->Src[2].Register;

	create_clamp(ctx, dst, src0, src1, src2);

	put_dst(ctx, inst, dst);
}

/* ARL(x) = x, but mova from hrN.x to a0.. */
static void
trans_arl(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	unsigned chan = src->SwizzleX;

	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);

	/* NOTE: we allocate a temporary from a flat register
	 * namespace (ignoring half vs full).  It turns out
	 * not to really matter since registers get reassigned
	 * later in ir3_ra which (hopefully!) can deal a bit
	 * better with mixed half and full precision.
	 */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* cov.{u,f}{32,16}s16 Rtmp, Rsrc */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
			get_ftype(ctx) : get_utype(ctx);
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, src, chan);

	/* shl.b Rtmp, Rtmp, 2 */
	instr = instr_create(ctx, 2, OPC_SHL_B);
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;

	/* mova a0, Rtmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_S16;
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}

/*
 * texture fetch/sample instructions:
 */

struct tex_info {
	int8_t order[4];    /* component layout of the coord arg, -1 = unused */
	int8_t args;        /* number of sample args (1, or 2 w/ bias/lod) */
	unsigned src_wrmask, flags;
};

struct target_info {
	uint8_t dims;
	uint8_t cube;
	uint8_t array;
	uint8_t shadow;
};

/* per-TGSI-texture-target dimensionality/cube/array/shadow info: */
static const struct target_info tex_targets[] = {
	[TGSI_TEXTURE_1D]               = { 1, 0, 0, 0 },
	[TGSI_TEXTURE_2D]               = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_3D]               = { 3, 0, 0, 0 },
	[TGSI_TEXTURE_CUBE]             = { 3, 1, 0, 0 },
	[TGSI_TEXTURE_RECT]             = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_SHADOW1D]         = { 1, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOW2D]         = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOWRECT]       = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_1D_ARRAY]         = { 1, 0, 1, 0 },
	[TGSI_TEXTURE_2D_ARRAY]         = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_SHADOW1D_ARRAY]   = { 1, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOW2D_ARRAY]   = { 2, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOWCUBE]       = { 3, 1, 0, 1 },
	[TGSI_TEXTURE_2D_MSAA]          = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_2D_ARRAY_MSAA]    = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_CUBE_ARRAY]       = { 3, 1, 1, 0 },
	[TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
};

/* Fill 'info' (flags, arg count, coord component order, src writemask)
 * for a texture instruction, based on its target and opcode.
 */
static void
fill_tex_info(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		struct tex_info *info)
{
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];

	if (tgt->dims == 3)
		info->flags |= IR3_INSTR_3D;
	if (tgt->array)
		info->flags |= IR3_INSTR_A;
	if (tgt->shadow)
		info->flags |= IR3_INSTR_S;

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
	case TGSI_OPCODE_TXL:
		info->args = 2;
		break;
	case TGSI_OPCODE_TXP:
		info->flags |= IR3_INSTR_P;
		/* fallthrough */
	case TGSI_OPCODE_TEX:
		info->args = 1;
		break;
	}

	/*
	 * lay out the first argument in the proper order:
	 *  - actual coordinates first
	 *  - array index
	 *  - shadow reference
	 *  - projection w
	 *
	 * bias/lod go into the second arg
	 */
	int arg, pos = 0;
	for (arg = 0; arg < tgt->dims; arg++)
		info->order[arg] = pos++;
	if (tgt->dims == 1)
		info->order[pos++] = -1;
	if (tgt->array)
		info->order[pos++] = arg++;
	if (tgt->shadow)
		info->order[pos++] = MAX2(arg, 2);
	if (info->flags & IR3_INSTR_P)
		info->order[pos++] = 3;

	info->src_wrmask = (1 << pos) - 1;

	for (; pos < 4; pos++)
		info->order[pos] = -1;

	assert(pos <= 4);
}

/* Return true if src's swizzle follows the relative component order in
 * 'order' (so the coord can be used directly without a shuffle).
 */
static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
{
	unsigned i;
	for (i = 1; (i < 4) && order[i] >= 0; i++)
		if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
			return false;
	return true;
}

static bool is_1d(unsigned tex)
{
	return tex_targets[tex].dims == 1;
}

static struct tgsi_src_register *
get_tex_coord(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		const struct tex_info *tinf)
{
	struct tgsi_src_register *coord = &inst->Src[0].Register;
	struct ir3_instruction *instr;
	unsigned tex = inst->Texture.Texture;
	bool
needs_mov = false;

	/* cat5 instruction cannot seem to handle const or relative: */
	if (is_rel_or_const(coord))
		needs_mov = true;

	/* 1D textures we fix up w/ 0.5 as 2nd coord: */
	if (is_1d(tex))
		needs_mov = true;

	/* The texture sample instructions need to coord in successive
	 * registers/components (ie. src.xy but not src.yx).  And TXP
	 * needs the .w component in .z for 2D..  so in some cases we
	 * might need to emit some mov instructions to shuffle things
	 * around:
	 */
	if (!needs_mov)
		needs_mov = !check_swiz(coord, tinf->order);

	if (needs_mov) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		unsigned j;

		type_t type_mov = get_ftype(ctx);

		/* need to move things around: */
		tmp_src = get_internal_temp(ctx, &tmp_dst);

		for (j = 0; j < 4; j++) {
			if (tinf->order[j] < 0)
				continue;
			instr = instr_create(ctx, 1, 0);  /* mov */
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, j);
			add_src_reg(ctx, instr, coord,
					src_swiz(coord, tinf->order[j]));
		}

		/* fix up .y coord: */
		if (is_1d(tex)) {
			instr = instr_create(ctx, 1, 0);  /* mov */
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
			ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = 0.5;
		}

		coord = tmp_src;
	}

	return coord;
}

/* TEX/TXP/TXB/TXB2/TXL -> sam/samb/saml.  The coord goes in the first
 * src arg; bias/lod (when present, tinf.args > 1) in the second:
 */
static void
trans_samp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *orig, *coord, *samp;
	struct tex_info tinf;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	coord = get_tex_coord(ctx, inst, &tinf);

	/* TXB2 (cube-array bias) carries the bias in src1.x, with the
	 * sampler moving to src2:
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2) {
		orig = &inst->Src[1].Register;
		samp = &inst->Src[2].Register;
	} else {
		orig = &inst->Src[0].Register;
		samp = &inst->Src[1].Register;
	}
	if (tinf.args > 1 && is_rel_or_const(orig))
		orig = get_unconst(ctx, orig);

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
	add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, tinf.src_wrmask);

	/* bias for TXB2 comes from .x of its own src, otherwise from .w: */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
		add_src_reg_wrmask(ctx, instr, orig, orig->SwizzleX, 0x1);
	else if (tinf.args > 1)
		add_src_reg_wrmask(ctx, instr, orig, orig->SwizzleW, 0x1);
}

/* TXQ -> getsize, with the lod level as the single src arg: */
static void
trans_txq(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *level = &inst->Src[0].Register;
	struct tgsi_src_register *samp = &inst->Src[1].Register;
	struct tex_info tinf;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	if (is_rel_or_const(level))
		level = get_unconst(ctx, level);

	instr = instr_create(ctx, 5, OPC_GETSIZE);
	instr->cat5.type = get_ftype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
	add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
}

/* DDX/DDY */
static void
trans_deriv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction
*instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	static const int8_t order[4] = {0, 1, 2, 3};

	/* dsx/dsy need the src in identity component order; copy to a
	 * temp if the swizzle does not already match:
	 */
	if (!check_swiz(src, order)) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		create_mov(ctx, &tmp_dst, src);

		src = tmp_src;
	}

	/* This might be a workaround for hw bug?  Blob compiler always
	 * seems to work two components at a time for dsy/dsx.  It does
	 * actually seem to work in some cases (or at least some piglit
	 * tests) for four components at a time.  But seems more reliable
	 * to split this into two instructions like the blob compiler
	 * does:
	 */

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
}

/*
 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
 *   cmps.f.eq tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SNE(a,b) = (a != b) ? 1.0 : 0.0
 *   cmps.f.ne tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
 *   cmps.f.ge tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
 *   cmps.f.le tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGT(a,b) = (a > b) ? 1.0 : 0.0
 *   cmps.f.gt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLT(a,b) = (a < b) ? 1.0 : 0.0
 *   cmps.f.lt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * CMP(a,b,c) = (a < 0.0) ? b : c
 *   cmps.f.lt tmp0, a, {0.0}
 *   sel.b16 dst, b, tmp0, c
 */
static void
trans_cmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval0;
	/* final instruction for CMP() uses orig src1 and src2: */
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_SLE:
		condition = IR3_COND_LE;
		break;
	case TGSI_OPCODE_SGT:
		condition = IR3_COND_GT;
		break;
	case TGSI_OPCODE_CMP:
		get_immediate(ctx, &constval0, fui(0.0));
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0.0} */
		condition = IR3_COND_LT;
		break;
	default:
		compile_assert(ctx, 0);
		return;
	}

	/* cmps can't take two const srcs; lift one into a temp: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	/* cmps.f.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLT:
		/*
 cov.u16f16 dst, tmp0 -- convert the 0/1 compare result to float */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_utype(ctx);
		instr->cat1.dst_type = get_ftype(ctx);
		vectorize(ctx, instr, dst, 1, tmp_src, 0);
		break;
	case TGSI_OPCODE_FSEQ:
	case TGSI_OPCODE_FSGE:
	case TGSI_OPCODE_FSNE:
	case TGSI_OPCODE_FSLT:
		/* absneg.s dst, (neg)tmp0 -- negating 0/1 gives the
		 * integer 0/~0 result the FS* opcodes want:
		 */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
		break;
	case TGSI_OPCODE_CMP:
		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);

		break;
	}

	put_dst(ctx, inst, dst);
}

/*
 * USNE(a,b) = (a != b) ? ~0 : 0
 *   cmps.u32.ne dst, a, b
 *
 * USEQ(a,b) = (a == b) ? ~0 : 0
 *   cmps.u32.eq dst, a, b
 *
 * ISGE(a,b) = (a > b) ? ~0 : 0
 *   cmps.s32.ge dst, a, b
 *
 * USGE(a,b) = (a > b) ? ~0 : 0
 *   cmps.u32.ge dst, a, b
 *
 * ISLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.s32.lt dst, a, b
 *
 * USLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.u32.lt dst, a, b
 *
 */
static void
trans_icmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register *a0, *a1;
	unsigned condition;

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_USNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_USEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_ISGE:
	case TGSI_OPCODE_USGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_ISLT:
	case TGSI_OPCODE_USLT:
		condition = IR3_COND_LT;
		break;

	default:
		compile_assert(ctx, 0);
		return;
	}

	/* cmps can't take two const srcs; lift one into a temp: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	tmp_src = get_internal_temp(ctx, &tmp_dst);
	/* cmps.{u32,s32}.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, t->opc);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	/* absneg.s dst, (neg)tmp -- negating 0/1 gives 0/~0 */
	instr = instr_create(ctx, 2, OPC_ABSNEG_S);
	vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);

	put_dst(ctx, inst, dst);
}

/*
 * UCMP(a,b,c) = a ?
 b : c
 *   sel.b16 dst, b, a, c
 */
static void
trans_ucmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */
	a2 = &inst->Src[2].Register;  /* c */

	if (is_rel_or_const(a0))
		a0 = get_unconst(ctx, a0);

	/* sel.{b32,b16} dst, b, a, c */
	instr = instr_create(ctx, 3, OPC_SEL_B32);
	vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
	put_dst(ctx, inst, dst);
}


/*
 * Conditional / Flow control
 */

/* record an open IF (or its ELSE) on the branch stack: */
static void
push_branch(struct ir3_compile_context *ctx, bool inv,
		struct ir3_instruction *instr, struct ir3_instruction *cond)
{
	unsigned int idx = ctx->branch_count++;
	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
	ctx->branch[idx].instr = instr;
	ctx->branch[idx].inv = inv;
	/* else side of branch has same condition: */
	if (!inv)
		ctx->branch[idx].cond = cond;
}

static struct ir3_instruction *
pop_branch(struct ir3_compile_context *ctx)
{
	unsigned int idx = --ctx->branch_count;
	return ctx->branch[idx].instr;
}

static void
trans_if(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *cond;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval;

	get_immediate(ctx, &constval, fui(0.0));
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	if (is_const(src))
		src = get_unconst(ctx, src);

	/* cmps.{f,u}.ne tmp0, b, {0.0} -- t->opc selects float (IF)
	 * vs unsigned (UIF) compare:
	 */
	instr = instr_create(ctx, 2, t->opc);
	add_dst_reg(ctx, instr, &tmp_dst, 0);
	add_src_reg(ctx, instr, src, src->SwizzleX);
	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
	instr->cat2.condition = IR3_COND_NE;

	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
	cond = instr->regs[1]->instr;

	/* meta:flow tmp0 */
	instr = instr_create(ctx, -1, OPC_META_FLOW);
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);

	push_branch(ctx, false, instr, cond);
	instr->flow.if_block = push_block(ctx);
}

static void
trans_else(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;

	pop_block(ctx);

	instr = pop_branch(ctx);

	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	push_branch(ctx, true, instr, NULL);
	instr->flow.else_block = push_block(ctx);
}

/* walk up the block hierarchy to find the closest enclosing write
 * of temporary 'n':
 */
static struct ir3_instruction *
find_temporary(struct ir3_block *block, unsigned n)
{
	if (block->parent && !block->temporaries[n])
		return find_temporary(block->parent, n);
	return block->temporaries[n];
}

/* same, but for shader outputs: */
static struct ir3_instruction *
find_output(struct ir3_block *block, unsigned n)
{
	if (block->parent && !block->outputs[n])
		return find_output(block->parent, n);
	return block->outputs[n];
}

static struct ir3_instruction *
create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
		struct ir3_instruction *a, struct ir3_instruction *b)
{
	struct ir3_instruction *phi;

	compile_assert(ctx, cond);

	/* Either side of the condition could be null.. which
	 * indicates a variable written on only one side of the
	 * branch.
 Normally this should only be variables not
	 * used outside of that side of the branch.  So we could
	 * just 'return a ? a : b;' in that case.  But for better
	 * defined undefined behavior we just stick in imm{0.0}.
	 * In the common case of a value only used within the
	 * one side of the branch, the PHI instruction will not
	 * get scheduled
	 */
	if (!a)
		a = create_immed(ctx, 0.0);
	if (!b)
		b = create_immed(ctx, 0.0);

	/* srcs: condition, then-value, else-value: */
	phi = instr_create(ctx, -1, OPC_META_PHI);
	ir3_reg_create(phi, 0, 0);  /* dummy dst */
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;

	return phi;
}

/* close the if/else construct: emit PHI instructions for every
 * temporary and output written on either side, and compact the
 * if/else blocks' output arrays down to just those values:
 */
static void
trans_endif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct ir3_block *ifb, *elseb;
	struct ir3_instruction **ifout, **elseout;
	unsigned i, ifnout = 0, elsenout = 0;

	pop_block(ctx);

	instr = pop_branch(ctx);

	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	ifb = instr->flow.if_block;
	elseb = instr->flow.else_block;
	/* if there is no else block, the parent block is used for the
	 * branch-not-taken src of the PHI instructions:
	 */
	if (!elseb)
		elseb = ifb->parent;

	/* worst case sizes: */
	ifnout = ifb->ntemporaries + ifb->noutputs;
	elsenout = elseb->ntemporaries + elseb->noutputs;

	ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
	if (elseb != ifb->parent)
		elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);

	ifnout = 0;
	elsenout = 0;

	/* generate PHI instructions for any temporaries written: */
	for (i = 0; i < ifb->ntemporaries; i++) {
		struct ir3_instruction *a = ifb->temporaries[i];
		struct ir3_instruction *b = elseb->temporaries[i];

		/* if temporary written in if-block, or if else block
		 * is present and temporary written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_temporary(ifb, i);
			if (!b)
				b = find_temporary(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->temporaries[i] = phi;
		}
	}

	compile_assert(ctx, ifb->noutputs == elseb->noutputs);

	/* .. and any outputs written: */
	for (i = 0; i < ifb->noutputs; i++) {
		struct ir3_instruction *a = ifb->outputs[i];
		struct ir3_instruction *b = elseb->outputs[i];

		/* if output written in if-block, or if else block
		 * is present and output written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_output(ifb, i);
			if (!b)
				b = find_output(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->outputs[i] = phi;
		}
	}

	ifb->noutputs = ifnout;
	ifb->outputs = ifout;

	if (elseb != ifb->parent) {
		elseb->noutputs = elsenout;
		elseb->outputs = elseout;
	}

	// TODO maybe we want to compact block->inputs?
}

/*
 * Kill
 */

static void
trans_kill(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *immed, *cond = NULL;
	bool inv = false;

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_KILL:
		/* unconditional kill, use enclosing if condition: */
		if (ctx->branch_count > 0) {
			unsigned int idx = ctx->branch_count - 1;
			cond = ctx->branch[idx].cond;
			inv = ctx->branch[idx].inv;
		} else {
			cond = create_immed(ctx, 1.0);
		}

		break;
	}

	compile_assert(ctx, cond);

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	cond = instr;

	/* kill p0.x */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	ctx->kill[ctx->kill_count++] = instr;
}

/*
 * Kill-If
 */

static void
trans_killif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct ir3_instruction *instr, *immed, *cond = NULL;
	bool inv = false;

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	add_src_reg(ctx, instr, src, src->SwizzleX);

	cond = instr;

	/* kill p0.x */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	ctx->kill[ctx->kill_count++] = instr;

}
/*
 * I2F / U2F / F2I / F2U
 */

static void
trans_cov(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src = &inst->Src[0].Register;

	/* cov.<src_type><dst_type> dst, src */
	instr = instr_create(ctx, 1, 0);
	/* NOTE(review): no default case -- src/dst types are left
	 * unset if this handler is reached with any other opcode;
	 * the translaters[] table only routes the four opcodes
	 * below here, but a compile_assert() default would be safer.
	 */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_U2F:
		instr->cat1.src_type = TYPE_U32;
		instr->cat1.dst_type = TYPE_F32;
		break;
	case TGSI_OPCODE_I2F:
		instr->cat1.src_type = TYPE_S32;
		instr->cat1.dst_type = TYPE_F32;
		break;
	case TGSI_OPCODE_F2U:
		instr->cat1.src_type = TYPE_F32;
		instr->cat1.dst_type = TYPE_U32;
		break;
	case TGSI_OPCODE_F2I:
		instr->cat1.src_type = TYPE_F32;
		instr->cat1.dst_type = TYPE_S32;
		break;

	}
	vectorize(ctx, instr, dst, 1, src, 0);
	put_dst(ctx, inst, dst);
}

/*
 * UMUL
 *
 * There is no 32-bit multiply instruction, so splitting a and b into high and
 * low components, we get that
 *
 * dst = al * bl + ah * bl << 16 + al * bh << 16
 *
 * mull.u tmp0, a, b           (mul low, i.e. al * bl)
 * madsh.m16 tmp1, a, b, tmp0  (mul-add shift high mix, i.e. ah * bl << 16)
 * madsh.m16 dst, b, a, tmp1   (i.e.
 al * bh << 16)
 */
static void
trans_umul(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_src_register *b = &inst->Src[1].Register;

	struct tgsi_dst_register tmp0_dst, tmp1_dst;
	struct tgsi_src_register *tmp0_src, *tmp1_src;

	tmp0_src = get_internal_temp(ctx, &tmp0_dst);
	tmp1_src = get_internal_temp(ctx, &tmp1_dst);

	if (is_rel_or_const(a))
		a = get_unconst(ctx, a);
	if (is_rel_or_const(b))
		b = get_unconst(ctx, b);

	/* mull.u tmp0, a, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);

	/* madsh.m16 tmp1, a, b, tmp0 */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);

	/* madsh.m16 dst, b, a, tmp1 */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
	put_dst(ctx, inst, dst);
}

/*
 * Handlers for TGSI instructions which do have 1:1 mapping to native
 * instructions:
 */

static void
instr_cat0(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	instr_create(ctx, 0, t->opc);
}

/* cat1 is just a mov: */
static void
instr_cat1(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src = &inst->Src[0].Register;
	create_mov(ctx, dst, src);
	put_dst(ctx, inst, dst);
}

static void
instr_cat2(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct ir3_instruction *instr;
	unsigned src0_flags = 0, src1_flags = 0;

	/* some opcodes are implemented via src modifiers on a
	 * more basic native instruction:
	 */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_ABS:
	case TGSI_OPCODE_IABS:
		src0_flags = IR3_REG_ABS;
		break;
	case TGSI_OPCODE_INEG:
		src0_flags = IR3_REG_NEGATE;
		break;
	case TGSI_OPCODE_SUB:
		src1_flags = IR3_REG_NEGATE;
		break;
	}

	switch (t->opc) {
	case OPC_ABSNEG_F:
	case OPC_ABSNEG_S:
	case OPC_CLZ_B:
	case OPC_CLZ_S:
	case OPC_SIGN_F:
	case OPC_FLOOR_F:
	case OPC_CEIL_F:
	case OPC_RNDNE_F:
	case OPC_RNDAZ_F:
	case OPC_TRUNC_F:
	case OPC_NOT_B:
	case OPC_BFREV_B:
	case OPC_SETRM:
	case OPC_CBITS_B:
		/* these only have one src reg */
		instr = instr_create(ctx, 2, t->opc);
		vectorize(ctx, instr, dst, 1, src0, src0_flags);
		break;
	default:
		/* cat2 can't take two const srcs; lift one into a temp: */
		if (is_const(src0) && is_const(src1))
			src0 = get_unconst(ctx, src0);

		instr = instr_create(ctx, 2, t->opc);
		vectorize(ctx, instr, dst, 2, src0, src0_flags,
				src1, src1_flags);
		break;
	}

	put_dst(ctx, inst, dst);
}

static void
instr_cat3(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct ir3_instruction *instr;

	/* in particular, can't handle const for src1 for cat3..
	 * for mad, we can swap first two src's if needed:
	 */
	if (is_rel_or_const(src1)) {
		if (is_mad(t->opc) && !is_rel_or_const(src0)) {
			struct tgsi_src_register *tmp;
			tmp = src0;
			src0 = src1;
			src1 = tmp;
		} else {
			src1 = get_unconst(ctx, src1);
		}
	}

	instr = instr_create(ctx, 3, t->opc);
	vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
			&inst->Src[2].Register, 0);
	put_dst(ctx, inst, dst);
}

/* cat4 (sfu) instructions are scalar, so replicate per component: */
static void
instr_cat4(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct ir3_instruction *instr;
	unsigned i;

	/* seems like blob compiler avoids const as src.. */
	if (is_const(src))
		src = get_unconst(ctx, src);

	/* we need to replicate into each component: */
	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			instr = instr_create(ctx, 4, t->opc);
			add_dst_reg(ctx, instr, dst, i);
			add_src_reg(ctx, instr, src, src->SwizzleX);
		}
	}

	put_dst(ctx, inst, dst);
}

/* dispatch table, indexed by TGSI opcode: */
static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
#define INSTR(n, f, ...) \
		[TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }

	INSTR(MOV, instr_cat1),
	INSTR(RCP, instr_cat4, .opc = OPC_RCP),
	INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
	INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
	INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
	INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
	INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
	INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
	INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
	INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
	INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
	INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
	INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
	INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
	INSTR(AND, instr_cat2, .opc = OPC_AND_B),
	INSTR(OR, instr_cat2, .opc = OPC_OR_B),
	INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
	INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
	INSTR(UMUL, trans_umul),
	INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
	INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
	INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
	INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
	INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
	/* NOTE(review): duplicate AND designated initializer (also
	 * listed above) -- harmless since both entries are identical,
	 * but could be dropped:
	 */
	INSTR(AND, instr_cat2, .opc = OPC_AND_B),
	INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
	INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
	INSTR(CLAMP, trans_clamp),
	INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
	INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
	INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
	INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F),
	INSTR(ARL, trans_arl),
	INSTR(UARL, trans_arl),
	INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
	INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
	INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
	INSTR(COS, instr_cat4, .opc = OPC_COS),
	INSTR(SIN, instr_cat4, .opc = OPC_SIN),
	INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
	INSTR(TXP, trans_samp, .opc = OPC_SAM,
 .arg = TGSI_OPCODE_TXP),
	INSTR(TXB, trans_samp, .opc = OPC_SAMB, .arg = TGSI_OPCODE_TXB),
	INSTR(TXB2, trans_samp, .opc = OPC_SAMB, .arg = TGSI_OPCODE_TXB2),
	INSTR(TXL, trans_samp, .opc = OPC_SAML, .arg = TGSI_OPCODE_TXL),
	INSTR(TXQ, trans_txq),
	INSTR(DDX, trans_deriv, .opc = OPC_DSX),
	INSTR(DDY, trans_deriv, .opc = OPC_DSY),
	INSTR(SGT, trans_cmp),
	INSTR(SLT, trans_cmp),
	INSTR(FSLT, trans_cmp),
	INSTR(SGE, trans_cmp),
	INSTR(FSGE, trans_cmp),
	INSTR(SLE, trans_cmp),
	INSTR(SNE, trans_cmp),
	INSTR(FSNE, trans_cmp),
	INSTR(SEQ, trans_cmp),
	INSTR(FSEQ, trans_cmp),
	INSTR(CMP, trans_cmp),
	INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
	INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
	INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
	INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
	INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
	INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
	INSTR(UCMP, trans_ucmp),
	INSTR(IF, trans_if, .opc = OPC_CMPS_F),
	INSTR(UIF, trans_if, .opc = OPC_CMPS_U),
	INSTR(ELSE, trans_else),
	INSTR(ENDIF, trans_endif),
	INSTR(END, instr_cat0, .opc = OPC_END),
	INSTR(KILL, trans_kill, .opc = OPC_KILL),
	INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
	INSTR(I2F, trans_cov),
	INSTR(U2F, trans_cov),
	INSTR(F2I, trans_cov),
	INSTR(F2U, trans_cov),
};

/* pack TGSI semantic name+index into the ir3 semantic encoding: */
static ir3_semantic
decl_semantic(const struct tgsi_declaration_semantic *sem)
{
	return ir3_semantic_name(sem->Name, sem->Index);
}

/* emit the bary.f interpolation instruction for one component of a
 * fragment shader input, sourcing the hw position via ctx->frag_pos:
 */
static struct ir3_instruction *
decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j, unsigned inloc)
{
	struct ir3_instruction *instr;
	struct ir3_register *src;

	/* bary.f dst, #inloc, r0.x */
	instr = instr_create(ctx, 2, OPC_BARY_F);
	ir3_reg_create(instr, regid, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
	src->wrmask = 0x3;    /* reads the r0.xy position pair */
	src->instr = ctx->frag_pos;

	return instr;
}

/* TGSI_SEMANTIC_POSITION
 * """"""""""""""""""""""
 *
 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
 * fragment shader input contains the fragment's window position.  The X
 * component starts at zero and always increases from left to right.
 * The Y component starts at zero and always increases but Y=0 may either
 * indicate the top of the window or the bottom depending on the fragment
 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
 * The Z coordinate ranges from 0 to 1 to represent depth from the front
 * to the back of the Z buffer.  The W component contains the reciprocal
 * of the interpolated vertex position W component.
 */
static struct ir3_instruction *
decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	compile_assert(ctx, !ctx->frag_coord[j]);

	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);


	switch (j) {
	case 0: /* .x */
	case 1: /* .y */
		/* for frag_coord, we get unsigned values..  we need
		 * to subtract (integer) 8 and divide by 16 (right-
		 * shift by 4) then convert to float:
		 */

		/* add.s tmp, src, -8 */
		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);  /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
		src = instr;

		/* shr.b tmp, tmp, 4 */
		instr = instr_create(ctx, 2, OPC_SHR_B);
		ir3_reg_create(instr, regid, 0);  /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
		src = instr;

		/* mov.u32f32 dst, tmp */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = TYPE_U32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);  /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 2: /* .z */
	case 3: /* .w */
		/* seems that we can use these as-is: */
		instr = ctx->frag_coord[j];
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}

/* TGSI_SEMANTIC_FACE
 * """"""""""""""""""
 *
 * This label applies to fragment shader inputs only and indicates that
 * the register contains front/back-face information of the form (F, 0,
 * 0, 1).  The first component will be positive when the fragment belongs
 * to a front-facing polygon, and negative when the fragment belongs to a
 * back-facing polygon.
 */

/* Emit the instruction(s) producing component @j of the TGSI_SEMANTIC_FACE
 * input.  Only .x carries faceness (converted here from the hw's -1/0 int
 * to the -1.0/1.0 float TGSI consumers expect); .y/.z/.w are the constant
 * (F, 0, 0, 1) pattern.  Register ids assigned here are dummies.
 */
static struct ir3_instruction *
decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	switch (j) {
	case 0: /* .x */
		compile_assert(ctx, !ctx->frag_face);

		ctx->frag_face = create_input(ctx->block, NULL, 0);

		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
		 * positive vs negative float.. and piglit further seems to
		 * expect -1.0 or 1.0:
		 *
		 *    mul.s tmp, hr0.x, 2
		 *    add.s tmp, tmp, 1
		 *    mov.s16f32, dst, tmp
		 *
		 */

		instr = instr_create(ctx, 2, OPC_MUL_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
		src = instr;

		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
		src = instr;

		instr = instr_create(ctx, 1, 0);    /* mov */
		instr->cat1.src_type = TYPE_S32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 1: /* .y */
	case 2: /* .z */
		instr = create_immed(ctx, 0.0);
		break;
	case 3: /* .w */
		instr = create_immed(ctx, 1.0);
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}

/* Handle a TGSI input declaration: record per-input metadata in the shader
 * variant (semantic, compmask, regid, inloc, interpolation) and populate
 * block->inputs[] with the producing instruction for every component.
 * For fragment shaders POSITION/FACE get special lowering (see the
 * decl_in_frag_* helpers above); everything else becomes bary.f fetches.
 * Also advances ctx->next_inloc / so->total_in for inputs that occupy
 * varying slots.
 */
static void
decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	/* I don't think we should get frag shader input without
	 * semantic info?  Otherwise how do inputs get linked to
	 * vert outputs?
	 */
	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
			decl->Declaration.Semantic);

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->inputs_count++;
		unsigned r = regid(i, 0);
		unsigned ncomp, j;

		/* we'll figure out the actual components used after scheduling */
		ncomp = 4;

		DBG("decl in -> r%d", i);

		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));

		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
		so->inputs[n].compmask = (1 << ncomp) - 1;
		so->inputs[n].regid = r;
		so->inputs[n].inloc = ctx->next_inloc;
		so->inputs[n].interpolate = decl->Interp.Interpolate;

		for (j = 0; j < ncomp; j++) {
			struct ir3_instruction *instr = NULL;

			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				/* for fragment shaders, POSITION and FACE are handled
				 * specially, not using normal varying / bary.f
				 */
				if (name == TGSI_SEMANTIC_POSITION) {
					so->inputs[n].bary = false;
					so->frag_coord = true;
					instr = decl_in_frag_coord(ctx, r + j, j);
				} else if (name == TGSI_SEMANTIC_FACE) {
					so->inputs[n].bary = false;
					so->frag_face = true;
					instr = decl_in_frag_face(ctx, r + j, j);
				} else {
					so->inputs[n].bary = true;
					instr = decl_in_frag_bary(ctx, r + j, j,
							so->inputs[n].inloc + j - 8);
				}
			} else {
				instr = create_input(ctx->block, NULL, (i * 4) + j);
			}

			ctx->block->inputs[(i * 4) + j] = instr;
		}

		/* only bary'd fs inputs and vs inputs consume varying slots: */
		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
			ctx->next_inloc += ncomp;
			so->total_in += ncomp;
		}
	}
}

/* Handle a TGSI output declaration: validate the semantic for the shader
 * stage, set writes_pos/writes_psize flags, record semantic+regid in the
 * variant's outputs[] table, and pre-seed block->outputs[] with imm{0.0}
 * movs so no output channel is left undefined.
 */
static void
decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned comp = 0;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	compile_assert(ctx, decl->Declaration.Semantic);

	DBG("decl out[%d] -> r%d", name, decl->Range.First);

	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_PSIZE:
			so->writes_psize = true;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_GENERIC:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
			break;
		default:
			compile_error(ctx, "unknown VS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	} else {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			comp = 2;  /* tgsi will write to .z component */
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_COLOR:
			break;
		default:
			compile_error(ctx, "unknown FS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	}

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->outputs_count++;
		unsigned ncomp, j;

		ncomp = 4;

		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
		so->outputs[n].regid = regid(i, comp);

		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
		 * which if the output is actually assigned will be over-
		 * written
		 */
		for (j = 0; j < ncomp; j++)
			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
	}
}

/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
 * for a fragment shader are just bary.f instructions.  The *actual* inputs
 * from the hw perspective are the frag_pos and optionally frag_coord and
 * frag_face.
2487 */ 2488static void 2489fixup_frag_inputs(struct ir3_compile_context *ctx) 2490{ 2491 struct ir3_shader_variant *so = ctx->so; 2492 struct ir3_block *block = ctx->block; 2493 struct ir3_instruction **inputs; 2494 struct ir3_instruction *instr; 2495 int n, regid = 0; 2496 2497 block->ninputs = 0; 2498 2499 n = 4; /* always have frag_pos */ 2500 n += COND(so->frag_face, 4); 2501 n += COND(so->frag_coord, 4); 2502 2503 inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *))); 2504 2505 if (so->frag_face) { 2506 /* this ultimately gets assigned to hr0.x so doesn't conflict 2507 * with frag_coord/frag_pos.. 2508 */ 2509 inputs[block->ninputs++] = ctx->frag_face; 2510 ctx->frag_face->regs[0]->num = 0; 2511 2512 /* remaining channels not used, but let's avoid confusing 2513 * other parts that expect inputs to come in groups of vec4 2514 */ 2515 inputs[block->ninputs++] = NULL; 2516 inputs[block->ninputs++] = NULL; 2517 inputs[block->ninputs++] = NULL; 2518 } 2519 2520 /* since we don't know where to set the regid for frag_coord, 2521 * we have to use r0.x for it. 
But we don't want to *always* 2522 * use r1.x for frag_pos as that could increase the register 2523 * footprint on simple shaders: 2524 */ 2525 if (so->frag_coord) { 2526 ctx->frag_coord[0]->regs[0]->num = regid++; 2527 ctx->frag_coord[1]->regs[0]->num = regid++; 2528 ctx->frag_coord[2]->regs[0]->num = regid++; 2529 ctx->frag_coord[3]->regs[0]->num = regid++; 2530 2531 inputs[block->ninputs++] = ctx->frag_coord[0]; 2532 inputs[block->ninputs++] = ctx->frag_coord[1]; 2533 inputs[block->ninputs++] = ctx->frag_coord[2]; 2534 inputs[block->ninputs++] = ctx->frag_coord[3]; 2535 } 2536 2537 /* we always have frag_pos: */ 2538 so->pos_regid = regid; 2539 2540 /* r0.x */ 2541 instr = create_input(block, NULL, block->ninputs); 2542 instr->regs[0]->num = regid++; 2543 inputs[block->ninputs++] = instr; 2544 ctx->frag_pos->regs[1]->instr = instr; 2545 2546 /* r0.y */ 2547 instr = create_input(block, NULL, block->ninputs); 2548 instr->regs[0]->num = regid++; 2549 inputs[block->ninputs++] = instr; 2550 ctx->frag_pos->regs[2]->instr = instr; 2551 2552 block->inputs = inputs; 2553} 2554 2555static void 2556compile_instructions(struct ir3_compile_context *ctx) 2557{ 2558 push_block(ctx); 2559 2560 /* for fragment shader, we have a single input register (usually 2561 * r0.xy) which is used as the base for bary.f varying fetch instrs: 2562 */ 2563 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { 2564 struct ir3_instruction *instr; 2565 instr = ir3_instr_create(ctx->block, -1, OPC_META_FI); 2566 ir3_reg_create(instr, 0, 0); 2567 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */ 2568 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */ 2569 ctx->frag_pos = instr; 2570 } 2571 2572 while (!tgsi_parse_end_of_tokens(&ctx->parser)) { 2573 tgsi_parse_token(&ctx->parser); 2574 2575 switch (ctx->parser.FullToken.Token.Type) { 2576 case TGSI_TOKEN_TYPE_DECLARATION: { 2577 struct tgsi_full_declaration *decl = 2578 &ctx->parser.FullToken.FullDeclaration; 2579 if (decl->Declaration.File == 
TGSI_FILE_OUTPUT) { 2580 decl_out(ctx, decl); 2581 } else if (decl->Declaration.File == TGSI_FILE_INPUT) { 2582 decl_in(ctx, decl); 2583 } 2584 break; 2585 } 2586 case TGSI_TOKEN_TYPE_IMMEDIATE: { 2587 /* TODO: if we know the immediate is small enough, and only 2588 * used with instructions that can embed an immediate, we 2589 * can skip this: 2590 */ 2591 struct tgsi_full_immediate *imm = 2592 &ctx->parser.FullToken.FullImmediate; 2593 unsigned n = ctx->so->immediates_count++; 2594 compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates)); 2595 memcpy(ctx->so->immediates[n].val, imm->u, 16); 2596 break; 2597 } 2598 case TGSI_TOKEN_TYPE_INSTRUCTION: { 2599 struct tgsi_full_instruction *inst = 2600 &ctx->parser.FullToken.FullInstruction; 2601 unsigned opc = inst->Instruction.Opcode; 2602 const struct instr_translater *t = &translaters[opc]; 2603 2604 if (t->fxn) { 2605 t->fxn(t, ctx, inst); 2606 ctx->num_internal_temps = 0; 2607 2608 compile_assert(ctx, !ctx->using_tmp_dst); 2609 } else { 2610 compile_error(ctx, "unknown TGSI opc: %s\n", 2611 tgsi_get_opcode_name(opc)); 2612 } 2613 2614 switch (inst->Instruction.Saturate) { 2615 case TGSI_SAT_ZERO_ONE: 2616 create_clamp_imm(ctx, &inst->Dst[0].Register, 2617 fui(0.0), fui(1.0)); 2618 break; 2619 case TGSI_SAT_MINUS_PLUS_ONE: 2620 create_clamp_imm(ctx, &inst->Dst[0].Register, 2621 fui(-1.0), fui(1.0)); 2622 break; 2623 } 2624 2625 instr_finish(ctx); 2626 2627 break; 2628 } 2629 default: 2630 break; 2631 } 2632 } 2633} 2634 2635static void 2636compile_dump(struct ir3_compile_context *ctx) 2637{ 2638 const char *name = (ctx->so->type == SHADER_VERTEX) ? 
"vert" : "frag"; 2639 static unsigned n = 0; 2640 char fname[16]; 2641 FILE *f; 2642 snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); 2643 f = fopen(fname, "w"); 2644 if (!f) 2645 return; 2646 ir3_block_depth(ctx->block); 2647 ir3_dump(ctx->ir, name, ctx->block, f); 2648 fclose(f); 2649} 2650 2651int 2652ir3_compile_shader(struct ir3_shader_variant *so, 2653 const struct tgsi_token *tokens, struct ir3_shader_key key, 2654 bool cp) 2655{ 2656 struct ir3_compile_context ctx; 2657 struct ir3_block *block; 2658 struct ir3_instruction **inputs; 2659 unsigned i, j, actual_in; 2660 int ret = 0; 2661 2662 assert(!so->ir); 2663 2664 so->ir = ir3_create(); 2665 2666 assert(so->ir); 2667 2668 if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) { 2669 DBG("INIT failed!"); 2670 ret = -1; 2671 goto out; 2672 } 2673 2674 compile_instructions(&ctx); 2675 2676 block = ctx.block; 2677 2678 /* keep track of the inputs from TGSI perspective.. */ 2679 inputs = block->inputs; 2680 2681 /* but fixup actual inputs for frag shader: */ 2682 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) 2683 fixup_frag_inputs(&ctx); 2684 2685 /* at this point, for binning pass, throw away unneeded outputs: */ 2686 if (key.binning_pass) { 2687 for (i = 0, j = 0; i < so->outputs_count; i++) { 2688 unsigned name = sem2name(so->outputs[i].semantic); 2689 unsigned idx = sem2name(so->outputs[i].semantic); 2690 2691 /* throw away everything but first position/psize */ 2692 if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || 2693 (name == TGSI_SEMANTIC_PSIZE))) { 2694 if (i != j) { 2695 so->outputs[j] = so->outputs[i]; 2696 block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; 2697 block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; 2698 block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; 2699 block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; 2700 } 2701 j++; 2702 } 2703 } 2704 so->outputs_count = j; 2705 block->noutputs = j * 4; 2706 } 2707 2708 /* for rendering to alpha format, we only need the .w component, 
2709 * and we need it to be in the .x position: 2710 */ 2711 if (key.alpha) { 2712 for (i = 0, j = 0; i < so->outputs_count; i++) { 2713 unsigned name = sem2name(so->outputs[i].semantic); 2714 2715 /* move .w component to .x and discard others: */ 2716 if (name == TGSI_SEMANTIC_COLOR) { 2717 block->outputs[(i*4)+0] = block->outputs[(i*4)+3]; 2718 block->outputs[(i*4)+1] = NULL; 2719 block->outputs[(i*4)+2] = NULL; 2720 block->outputs[(i*4)+3] = NULL; 2721 } 2722 } 2723 } 2724 2725 /* at this point, we want the kill's in the outputs array too, 2726 * so that they get scheduled (since they have no dst).. we've 2727 * already ensured that the array is big enough in push_block(): 2728 */ 2729 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { 2730 for (i = 0; i < ctx.kill_count; i++) 2731 block->outputs[block->noutputs++] = ctx.kill[i]; 2732 } 2733 2734 if (fd_mesa_debug & FD_DBG_OPTDUMP) 2735 compile_dump(&ctx); 2736 2737 ret = ir3_block_flatten(block); 2738 if (ret < 0) { 2739 DBG("FLATTEN failed!"); 2740 goto out; 2741 } 2742 if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP)) 2743 compile_dump(&ctx); 2744 2745 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 2746 printf("BEFORE CP:\n"); 2747 ir3_dump_instr_list(block->head); 2748 } 2749 2750 if (cp) 2751 ir3_block_cp(block); 2752 2753 if (fd_mesa_debug & FD_DBG_OPTDUMP) 2754 compile_dump(&ctx); 2755 2756 ir3_block_depth(block); 2757 2758 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 2759 printf("AFTER DEPTH:\n"); 2760 ir3_dump_instr_list(block->head); 2761 } 2762 2763 ret = ir3_block_sched(block); 2764 if (ret) { 2765 DBG("SCHED failed!"); 2766 goto out; 2767 } 2768 2769 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 2770 printf("AFTER SCHED:\n"); 2771 ir3_dump_instr_list(block->head); 2772 } 2773 2774 ret = ir3_block_ra(block, so->type, key.half_precision, 2775 so->frag_coord, so->frag_face, &so->has_samp); 2776 if (ret) { 2777 DBG("RA failed!"); 2778 goto out; 2779 } 2780 2781 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 2782 printf("AFTER RA:\n"); 2783 
ir3_dump_instr_list(block->head); 2784 } 2785 2786 /* fixup input/outputs: */ 2787 for (i = 0; i < so->outputs_count; i++) { 2788 so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; 2789 /* preserve hack for depth output.. tgsi writes depth to .z, 2790 * but what we give the hw is the scalar register: 2791 */ 2792 if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) && 2793 (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION)) 2794 so->outputs[i].regid += 2; 2795 } 2796 /* Note that some or all channels of an input may be unused: */ 2797 actual_in = 0; 2798 for (i = 0; i < so->inputs_count; i++) { 2799 unsigned j, regid = ~0, compmask = 0; 2800 so->inputs[i].ncomp = 0; 2801 for (j = 0; j < 4; j++) { 2802 struct ir3_instruction *in = inputs[(i*4) + j]; 2803 if (in) { 2804 compmask |= (1 << j); 2805 regid = in->regs[0]->num - j; 2806 actual_in++; 2807 so->inputs[i].ncomp++; 2808 } 2809 } 2810 so->inputs[i].regid = regid; 2811 so->inputs[i].compmask = compmask; 2812 } 2813 2814 /* fragment shader always gets full vec4's even if it doesn't 2815 * fetch all components, but vertex shader we need to update 2816 * with the actual number of components fetch, otherwise thing 2817 * will hang due to mismaptch between VFD_DECODE's and 2818 * TOTALATTRTOVS 2819 */ 2820 if (so->type == SHADER_VERTEX) 2821 so->total_in = actual_in; 2822 2823out: 2824 if (ret) { 2825 ir3_destroy(so->ir); 2826 so->ir = NULL; 2827 } 2828 compile_free(&ctx); 2829 2830 return ret; 2831} 2832