ir3_compiler.c revision 547182977f5d893334cb630b974136c05a9461ab
1/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ 2 3/* 4 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org> 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 * SOFTWARE. 
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include <stdarg.h>

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "tgsi/tgsi_lowering.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_scan.h"

#include "freedreno_util.h"

#include "ir3_compiler.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"

/* Per-shader-variant compile state, threaded through every helper in
 * this file.  Lives only for the duration of one compile.
 */
struct ir3_compile_context {
	const struct tgsi_token *tokens;
	bool free_tokens;          /* true when tokens came from the lowering pass and must be free()d */
	struct ir3 *ir;
	struct ir3_shader_variant *so;

	struct ir3_block *block;   /* innermost block currently being emitted into */
	struct ir3_instruction *current_instr;

	/* we need to defer updates to block->outputs[] until the end
	 * of an instruction (so we don't see new value until *after*
	 * the src registers are processed)
	 */
	struct {
		struct ir3_instruction *instr, **instrp;
	} output_updates[16];
	unsigned num_output_updates;

	/* are we in a sequence of "atomic" instructions?
	 */
	bool atomic;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * values that can be directly used for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	struct tgsi_parse_context parser;
	unsigned type;             /* TGSI_PROCESSOR_* of the shader being compiled */

	struct tgsi_shader_info info;

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	unsigned num_internal_temps;
	struct tgsi_src_register internal_temps[8];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	/* stack of branch instructions that mark (potentially nested)
	 * branch if/else/loop/etc
	 */
	struct {
		struct ir3_instruction *instr, *cond;
		bool inv;   /* true iff in else leg of branch */
	} branch[16];
	unsigned int branch_count;

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* used when dst is same as one of the src, to avoid overwriting a
	 * src element before the remaining scalar instructions that make
	 * up the vector operation
	 */
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	/* just for catching incorrect use of get_dst()/put_dst():
	 */
	bool using_tmp_dst;
};


static void vectorize(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
		int nsrcs, ...);
static void create_mov(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
static type_t get_ftype(struct ir3_compile_context *ctx);

/* Run the TGSI lowering passes appropriate for this variant's key and
 * reset all per-compile state in ctx.  Returns TGSI_PARSE_OK on success
 * or TGSI_PARSE_ERROR for shaders we cannot handle (e.g. relative
 * addressing of TEMP/IN/OUT files, which the optimizer can't cope with).
 */
static unsigned
compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
		const struct tgsi_token *tokens)
{
	unsigned ret;
	struct tgsi_shader_info *info = &ctx->info;
	struct tgsi_lowering_config lconfig = {
			.color_two_side = so->key.color_two_side,
			.lower_DST = true,
			.lower_XPD = true,
			.lower_SCS = true,
			.lower_LRP = true,
			.lower_FRC = true,
			.lower_POW = true,
			.lower_LIT = true,
			.lower_EXP = true,
			.lower_LOG = true,
			.lower_DP4 = true,
			.lower_DP3 = true,
			.lower_DPH = true,
			.lower_DP2 = true,
			.lower_DP2A = true,
	};

	/* texture saturate lowering is keyed per shader stage: */
	switch (so->type) {
	case SHADER_FRAGMENT:
	case SHADER_COMPUTE:
		lconfig.saturate_s = so->key.fsaturate_s;
		lconfig.saturate_t = so->key.fsaturate_t;
		lconfig.saturate_r = so->key.fsaturate_r;
		break;
	case SHADER_VERTEX:
		lconfig.saturate_s = so->key.vsaturate_s;
		lconfig.saturate_t = so->key.vsaturate_t;
		lconfig.saturate_r = so->key.vsaturate_r;
		break;
	}

	/* NULL return means the lowering pass had nothing to do, in which
	 * case we keep (and must not free) the caller's token array:
	 */
	ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
	ctx->free_tokens = !!ctx->tokens;
	if (!ctx->tokens) {
		/* no lowering */
		ctx->tokens = tokens;
	}
	ctx->ir = so->ir;
	ctx->so = so;
	ctx->next_inloc = 8;
	ctx->num_internal_temps = 0;
	ctx->branch_count = 0;
	ctx->kill_count = 0;
	ctx->block = NULL;
	ctx->current_instr = NULL;
	ctx->num_output_updates = 0;
	ctx->atomic = false;
	ctx->frag_pos = NULL;
	ctx->frag_face = NULL;
	ctx->tmp_src = NULL;
	ctx->using_tmp_dst = false;

	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));

#define FM(x) (1 << TGSI_FILE_##x)
	/* optimize can't deal with relative addressing: */
	if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
		return TGSI_PARSE_ERROR;

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	if (info->indirect_files & FM(CONSTANT))
		so->constlen = 4 * (ctx->info.file_max[TGSI_FILE_CONSTANT] + 1);

	/* Immediates go after constants: */
	so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);

	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;

	return ret;
}

/* Report a fatal compile problem: print the message and a TGSI dump,
 * then assert (debug builds).
 */
static void
compile_error(struct ir3_compile_context *ctx, const char *format, ...)
{
	va_list ap;
	va_start(ap, format);
	_debug_vprintf(format, ap);
	va_end(ap);
	tgsi_dump(ctx->tokens, 0);
	debug_assert(0);
}

#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)

static void
compile_free(struct ir3_compile_context *ctx)
{
	if (ctx->free_tokens)
		free((void *)ctx->tokens);
	tgsi_parse_free(&ctx->parser);
}

/* Table entry mapping one TGSI opcode to its translation handler. */
struct instr_translater {
	void (*fxn)(const struct instr_translater *t,
			struct ir3_compile_context *ctx,
			struct tgsi_full_instruction *inst);
	unsigned tgsi_opc;
	opc_t opc;
	opc_t hopc;    /* opc to use for half_precision mode, if different */
	unsigned arg;
};

/* Flush the deferred writes queued by ssa_dst() into the block's
 * outputs/temporaries/address slots.  No-op while inside an atomic
 * group (see instr_atomic_start/end below).
 */
static void
instr_finish(struct ir3_compile_context *ctx)
{
	unsigned i;

	if (ctx->atomic)
		return;

	for (i = 0; i < ctx->num_output_updates; i++)
		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;

	ctx->num_output_updates = 0;
}

/* For "atomic" groups of instructions, for example the four scalar
 * instructions to perform a vec4 operation.  Basically this just
 * blocks out handling of output_updates so the next scalar instruction
 * still sees the result from before the start of the atomic group.
 *
 * NOTE: when used properly, this could probably replace get/put_dst()
 * stuff.
 */
static void
instr_atomic_start(struct ir3_compile_context *ctx)
{
	ctx->atomic = true;
}

static void
instr_atomic_end(struct ir3_compile_context *ctx)
{
	ctx->atomic = false;
	instr_finish(ctx);
}

/* Create a new instruction in the current block, first flushing any
 * deferred dst updates from the previous instruction.
 */
static struct ir3_instruction *
instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
}

static struct ir3_instruction *
instr_clone(struct ir3_compile_context *ctx, struct ir3_instruction *instr)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_clone(instr));
}

/* Enter a new (possibly nested) block, sizing its temp/input/output
 * arrays from the TGSI declarations (scalar components, hence 4x).
 */
static struct ir3_block *
push_block(struct ir3_compile_context *ctx)
{
	struct ir3_block *block;
	unsigned ntmp, nin, nout;

#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))

	/* hmm, give ourselves room to create 8 extra temporaries (vec4):
	 */
	ntmp = SCALAR_REGS(TEMPORARY);
	ntmp += 8 * 4;

	nout = SCALAR_REGS(OUTPUT);
	nin  = SCALAR_REGS(INPUT);

	/* for outermost block, 'inputs' are the actual shader INPUT
	 * register file.  Reads from INPUT registers always go back to
	 * top block.  For nested blocks, 'inputs' is used to track any
	 * TEMPORARY file register from one of the enclosing blocks that
	 * is ready in this block.
	 */
	if (!ctx->block) {
		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
		 * position)
		 */
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			int n = 2;
			if (ctx->info.reads_position)
				n += 4;
			if (ctx->info.uses_frontface)
				n += 4;
			nin = MAX2(n, nin);
			/* reserve extra output slots for kill tracking: */
			nout += ARRAY_SIZE(ctx->kill);
		}
	} else {
		nin = ntmp;
	}

	block = ir3_block_create(ctx->ir, ntmp, nin, nout);

	/* the kill slots were only reserved as scratch space, don't count
	 * them as real outputs:
	 */
	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
		block->noutputs -= ARRAY_SIZE(ctx->kill);

	block->parent = ctx->block;
	ctx->block = block;

	return block;
}

static void
pop_block(struct ir3_compile_context *ctx)
{
	ctx->block = ctx->block->parent;
	compile_assert(ctx, ctx->block);
}

/* Emit a meta OUTPUT node for scalar slot n, optionally fed by instr. */
static struct ir3_instruction *
create_output(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *out;

	out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
	out->inout.block = block;
	ir3_reg_create(out, n, 0);
	if (instr)
		ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;

	return out;
}

/* Emit a meta INPUT node for scalar slot n, optionally fed by instr. */
static struct ir3_instruction *
create_input(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *in;

	in = ir3_instr_create(block, -1, OPC_META_INPUT);
	in->inout.block = block;
	ir3_reg_create(in, n, 0);
	if (instr)
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;

	return in;
}

static struct ir3_instruction *
block_input(struct ir3_block *block, unsigned n)
{
	/* references to INPUT register file always go back up to
	 * top level:
	 */
	if (block->parent)
		return block_input(block->parent, n);
	return block->inputs[n];
}

/* return temporary in scope, creating if needed meta-input node
 * to track block inputs
 */
static struct ir3_instruction *
block_temporary(struct
ir3_block *block, unsigned n)
{
	/* references to TEMPORARY register file, find the nearest
	 * enclosing block which has already assigned this temporary,
	 * creating meta-input instructions along the way to keep
	 * track of block inputs
	 */
	if (block->parent && !block->temporaries[n]) {
		/* if already have input for this block, reuse: */
		if (!block->inputs[n])
			block->inputs[n] = block_temporary(block->parent, n);

		/* and create new input to return: */
		return create_input(block, block->inputs[n], n);
	}
	return block->temporaries[n];
}

/* Emit a scalar cat1 mov of a float immediate (src/dst type from
 * get_ftype()).
 */
static struct ir3_instruction *
create_immed(struct ir3_compile_context *ctx, float val)
{
	/* NOTE: *don't* use instr_create() here!
	 */
	struct ir3_instruction *instr;
	instr = ir3_instr_create(ctx->block, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	ir3_reg_create(instr, 0, 0);
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
	return instr;
}

/* Queue a deferred write of 'instr' into the block slot named by
 * (dst->File, dst->Index, chan); actually applied by instr_finish().
 */
static void
ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	unsigned n = regid(dst->Index, chan);
	unsigned idx = ctx->num_output_updates;

	compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));

	/* NOTE: defer update of temporaries[idx] or output[idx]
	 * until instr_finish(), so that if the current instruction
	 * reads the same TEMP/OUT[] it gets the old value:
	 *
	 * bleh.. this might be a bit easier to just figure out
	 * in instr_finish().  But at that point we've already
	 * lost information about OUTPUT vs TEMPORARY register
	 * file..
	 */

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
		compile_assert(ctx, n < ctx->block->noutputs);
		ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_TEMPORARY:
		compile_assert(ctx, n < ctx->block->ntemporaries);
		ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_ADDRESS:
		compile_assert(ctx, n < 1);
		ctx->output_updates[idx].instrp = &ctx->block->address;
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	}
}

/* Resolve the SSA producer for src channel 'chan' and attach it to reg. */
static void
ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
		const struct tgsi_src_register *src, unsigned chan)
{
	struct ir3_block *block = ctx->block;
	unsigned n = regid(src->Index, chan);

	switch (src->File) {
	case TGSI_FILE_INPUT:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_input(ctx->block, n);
		break;
	case TGSI_FILE_OUTPUT:
		/* really this should just happen in case of 'MOV_SAT OUT[n], ..',
		 * for the following clamp instructions:
		 */
		reg->flags |= IR3_REG_SSA;
		reg->instr = block->outputs[n];
		/* we don't have to worry about read from an OUTPUT that was
		 * assigned outside of the current block, because the _SAT
		 * clamp instructions will always be in the same block as
		 * the original instruction which wrote the OUTPUT
		 */
		compile_assert(ctx, reg->instr);
		break;
	case TGSI_FILE_TEMPORARY:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_temporary(ctx->block, n);
		break;
	}

	if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
		/* this can happen when registers (or components of a TGSI
		 * register) are used as src before they have been assigned
		 * (undefined contents).
To avoid confusing the rest of the 510 * compiler, and to generally keep things peachy, substitute 511 * an instruction that sets the src to 0.0. Or to keep 512 * things undefined, I could plug in a random number? :-P 513 * 514 * NOTE: *don't* use instr_create() here! 515 */ 516 reg->instr = create_immed(ctx, 0.0); 517 } 518} 519 520static struct ir3_register * 521add_dst_reg_wrmask(struct ir3_compile_context *ctx, 522 struct ir3_instruction *instr, const struct tgsi_dst_register *dst, 523 unsigned chan, unsigned wrmask) 524{ 525 unsigned flags = 0, num = 0; 526 struct ir3_register *reg; 527 528 switch (dst->File) { 529 case TGSI_FILE_OUTPUT: 530 case TGSI_FILE_TEMPORARY: 531 /* uses SSA */ 532 break; 533 case TGSI_FILE_ADDRESS: 534 flags |= IR3_REG_ADDR; 535 /* uses SSA */ 536 break; 537 default: 538 compile_error(ctx, "unsupported dst register file: %s\n", 539 tgsi_file_name(dst->File)); 540 break; 541 } 542 543 if (dst->Indirect) 544 flags |= IR3_REG_RELATIV; 545 546 reg = ir3_reg_create(instr, regid(num, chan), flags); 547 548 /* NOTE: do not call ssa_dst() if atomic.. vectorize() 549 * itself will call ssa_dst(). This is to filter out 550 * the (initially bogus) .x component dst which is 551 * created (but not necessarily used, ie. 
if the net 552 * result of the vector operation does not write to 553 * the .x component) 554 */ 555 556 reg->wrmask = wrmask; 557 if (wrmask == 0x1) { 558 /* normal case */ 559 if (!ctx->atomic) 560 ssa_dst(ctx, instr, dst, chan); 561 } else if ((dst->File == TGSI_FILE_TEMPORARY) || 562 (dst->File == TGSI_FILE_OUTPUT) || 563 (dst->File == TGSI_FILE_ADDRESS)) { 564 unsigned i; 565 566 /* if instruction writes multiple, we need to create 567 * some place-holder collect the registers: 568 */ 569 for (i = 0; i < 4; i++) { 570 if (wrmask & (1 << i)) { 571 struct ir3_instruction *collect = 572 ir3_instr_create(ctx->block, -1, OPC_META_FO); 573 collect->fo.off = i; 574 /* unused dst reg: */ 575 ir3_reg_create(collect, 0, 0); 576 /* and src reg used to hold original instr */ 577 ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr; 578 if (!ctx->atomic) 579 ssa_dst(ctx, collect, dst, chan+i); 580 } 581 } 582 } 583 584 return reg; 585} 586 587static struct ir3_register * 588add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, 589 const struct tgsi_dst_register *dst, unsigned chan) 590{ 591 return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1); 592} 593 594static struct ir3_register * 595add_src_reg_wrmask(struct ir3_compile_context *ctx, 596 struct ir3_instruction *instr, const struct tgsi_src_register *src, 597 unsigned chan, unsigned wrmask) 598{ 599 unsigned flags = 0, num = 0; 600 struct ir3_register *reg; 601 struct ir3_instruction *orig = NULL; 602 603 switch (src->File) { 604 case TGSI_FILE_IMMEDIATE: 605 /* TODO if possible, use actual immediate instead of const.. but 606 * TGSI has vec4 immediates, we can only embed scalar (of limited 607 * size, depending on instruction..) 
608 */ 609 flags |= IR3_REG_CONST; 610 num = src->Index + ctx->so->first_immediate; 611 break; 612 case TGSI_FILE_CONSTANT: 613 flags |= IR3_REG_CONST; 614 num = src->Index; 615 break; 616 case TGSI_FILE_OUTPUT: 617 /* NOTE: we should only end up w/ OUTPUT file for things like 618 * clamp()'ing saturated dst instructions 619 */ 620 case TGSI_FILE_INPUT: 621 case TGSI_FILE_TEMPORARY: 622 /* uses SSA */ 623 break; 624 default: 625 compile_error(ctx, "unsupported src register file: %s\n", 626 tgsi_file_name(src->File)); 627 break; 628 } 629 630 /* We seem to have 8 bits (6.2) for dst register always, so I think 631 * it is safe to assume GPR cannot be >=64 632 * 633 * cat3 instructions only have 8 bits for src2, but cannot take a 634 * const for src2 635 * 636 * cat5 and cat6 in some cases only has 8 bits, but cannot take a 637 * const for any src. 638 * 639 * Other than that we seem to have 12 bits to encode const src, 640 * except for cat1 which may only have 11 bits (but that seems like 641 * a bug) 642 */ 643 if (flags & IR3_REG_CONST) 644 compile_assert(ctx, src->Index < (1 << 9)); 645 else 646 compile_assert(ctx, src->Index < (1 << 6)); 647 648 if (src->Absolute) 649 flags |= IR3_REG_ABS; 650 if (src->Negate) 651 flags |= IR3_REG_NEGATE; 652 653 if (src->Indirect) { 654 flags |= IR3_REG_RELATIV; 655 656 /* shouldn't happen, and we can't cope with it below: */ 657 compile_assert(ctx, wrmask == 0x1); 658 659 /* wrap in a meta-deref to track both the src and address: */ 660 orig = instr; 661 662 instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF); 663 ir3_reg_create(instr, 0, 0); 664 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address; 665 } 666 667 reg = ir3_reg_create(instr, regid(num, chan), flags); 668 669 reg->wrmask = wrmask; 670 if (wrmask == 0x1) { 671 /* normal case */ 672 ssa_src(ctx, reg, src, chan); 673 } else if ((src->File == TGSI_FILE_TEMPORARY) || 674 (src->File == TGSI_FILE_OUTPUT) || 675 (src->File == TGSI_FILE_INPUT)) { 676 
struct ir3_instruction *collect; 677 unsigned i; 678 679 compile_assert(ctx, !src->Indirect); 680 681 /* if instruction reads multiple, we need to create 682 * some place-holder collect the registers: 683 */ 684 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI); 685 ir3_reg_create(collect, 0, 0); /* unused dst reg */ 686 687 for (i = 0; i < 4; i++) { 688 if (wrmask & (1 << i)) { 689 /* and src reg used point to the original instr */ 690 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), 691 src, chan + i); 692 } else if (wrmask & ~((i << i) - 1)) { 693 /* if any remaining components, then dummy 694 * placeholder src reg to fill in the blanks: 695 */ 696 ir3_reg_create(collect, 0, 0); 697 } 698 } 699 700 reg->flags |= IR3_REG_SSA; 701 reg->instr = collect; 702 } 703 704 if (src->Indirect) { 705 reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA); 706 reg->instr = instr; 707 } 708 return reg; 709} 710 711static struct ir3_register * 712add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr, 713 const struct tgsi_src_register *src, unsigned chan) 714{ 715 return add_src_reg_wrmask(ctx, instr, src, chan, 0x1); 716} 717 718static void 719src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) 720{ 721 src->File = dst->File; 722 src->Indirect = dst->Indirect; 723 src->Dimension = dst->Dimension; 724 src->Index = dst->Index; 725 src->Absolute = 0; 726 src->Negate = 0; 727 src->SwizzleX = TGSI_SWIZZLE_X; 728 src->SwizzleY = TGSI_SWIZZLE_Y; 729 src->SwizzleZ = TGSI_SWIZZLE_Z; 730 src->SwizzleW = TGSI_SWIZZLE_W; 731} 732 733/* Get internal-temp src/dst to use for a sequence of instructions 734 * generated by a single TGSI op. 
 */
static struct tgsi_src_register *
get_internal_temp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst)
{
	struct tgsi_src_register *tmp_src;
	int n;

	tmp_dst->File      = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect  = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
	tmp_src = &ctx->internal_temps[n];

	/* internal temps are allocated just past the shader-declared ones: */
	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;

	src_from_dst(tmp_src, tmp_dst);

	return tmp_src;
}

static inline bool
is_const(struct tgsi_src_register *src)
{
	return (src->File == TGSI_FILE_CONSTANT) ||
			(src->File == TGSI_FILE_IMMEDIATE);
}

static inline bool
is_relative(struct tgsi_src_register *src)
{
	return src->Indirect;
}

static inline bool
is_rel_or_const(struct tgsi_src_register *src)
{
	return is_relative(src) || is_const(src);
}

static type_t
get_ftype(struct ir3_compile_context *ctx)
{
	return TYPE_F32;
}

static type_t
get_utype(struct ir3_compile_context *ctx)
{
	return TYPE_U32;
}

static type_t
get_stype(struct ir3_compile_context *ctx)
{
	return TYPE_S32;
}

/* Return the swizzle selector of 'src' for channel 'chan' (0..3). */
static unsigned
src_swiz(struct tgsi_src_register *src, int chan)
{
	switch (chan) {
	case 0: return src->SwizzleX;
	case 1: return src->SwizzleY;
	case 2: return src->SwizzleZ;
	case 3: return src->SwizzleW;
	}
	assert(0);
	return 0;
}

/* for instructions that cannot take a const register as src, if needed
 * generate a move to temporary gpr:
 */
static struct tgsi_src_register *
get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	compile_assert(ctx, is_rel_or_const(src));

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	create_mov(ctx, &tmp_dst, src);

	return tmp_src;
}

/* Find (or allocate) a compiler-generated immediate slot holding 'val'
 * (or -val, using the Negate modifier) and fill in 'reg' to reference
 * it with a broadcast swizzle.
 */
static void
get_immediate(struct ir3_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	/* reuse an existing immediate (possibly negated) if we have one: */
	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx  = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx  = i / 4;
		neg  = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	reg->File      = TGSI_FILE_IMMEDIATE;
	reg->Indirect  = 0;
	reg->Dimension = 0;
	reg->Index     = idx;
	reg->Absolute  = 0;
	reg->Negate    = neg;
	reg->SwizzleX  = swiz2tgsi[swiz];
	reg->SwizzleY  = swiz2tgsi[swiz];
	reg->SwizzleZ  = swiz2tgsi[swiz];
	reg->SwizzleW  = swiz2tgsi[swiz];
}

/* Emit per-component movs of src into dst, honoring dst's WriteMask. */
static void
create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	type_t type_mov = get_ftype(ctx);
	unsigned i;

	for (i = 0; i < 4; i++) {
		/* move to destination: */
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *instr;

			if (src->Absolute || src->Negate) {
				/* can't have abs or neg on a mov instr, so use
				 * absneg.f instead to handle these cases:
				 */
				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
			} else {
				instr = instr_create(ctx, 1, 0);
				instr->cat1.src_type = type_mov;
				instr->cat1.dst_type = type_mov;
			}

			add_dst_reg(ctx, instr, dst, i);
			add_src_reg(ctx, instr, src, src_swiz(src, i));
		}
	}
}

/* dst = clamp(val, minval, maxval), via max.f followed by min.f. */
static void
create_clamp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
{
	struct ir3_instruction *instr;

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);

	instr = instr_create(ctx, 2, OPC_MIN_F);
	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
}

/* Clamp dst in-place between two immediate values. */
static void
create_clamp_imm(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst,
		uint32_t minval, uint32_t maxval)
{
	struct tgsi_src_register minconst, maxconst;
	struct tgsi_src_register src;

	src_from_dst(&src, dst);

	get_immediate(ctx, &minconst, minval);
	get_immediate(ctx, &maxconst, maxval);

	create_clamp(ctx, dst, &src, &minconst, &maxconst);
}

/* Return the dst to emit into for 'inst': the real dst, or a fresh
 * internal temp when the dst aliases one of the srcs (so the scalar
 * expansion doesn't clobber a src component it still needs to read).
 * Must be paired with put_dst().
 */
static struct tgsi_dst_register *
get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	unsigned i;

	compile_assert(ctx, !ctx->using_tmp_dst);
	ctx->using_tmp_dst = true;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_src_register *src = &inst->Src[i].Register;
		if ((src->File == dst->File) && (src->Index == dst->Index)) {
			/* full overwrite with identity swizzle is harmless: */
			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
					(src->SwizzleX == TGSI_SWIZZLE_X) &&
					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
					(src->SwizzleW == TGSI_SWIZZLE_W))
				continue;
			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
			ctx->tmp_dst.WriteMask = dst->WriteMask;
			dst = &ctx->tmp_dst;
			break;
		}
	}
	return dst;
}

/* Counterpart of get_dst(): copy the temp back to the real dst if one
 * was substituted.
 */
static void
put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
		struct tgsi_dst_register *dst)
{
	compile_assert(ctx,
			ctx->using_tmp_dst);
	ctx->using_tmp_dst = false;

	/* if necessary, add mov back into original dst: */
	if (dst != &inst->Dst[0].Register) {
		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
	}
}

/* helper to generate the necessary repeat and/or additional instructions
 * to turn a scalar instruction into a vector operation:
 *
 * The varargs are nsrcs pairs of (struct tgsi_src_register *, unsigned
 * flags); a pair with IR3_REG_IMMED set passes the src pointer's bits
 * as an inline integer immediate instead.  The instruction is cloned
 * once per written component, inside an atomic group so later clones
 * still read pre-instruction values.
 */
static void
vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		struct tgsi_dst_register *dst, int nsrcs, ...)
{
	va_list ap;
	int i, j, n = 0;

	instr_atomic_start(ctx);

	/* emit a provisional .x dst, fixed up per component below: */
	add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);

	va_start(ap, nsrcs);
	for (j = 0; j < nsrcs; j++) {
		struct tgsi_src_register *src =
				va_arg(ap, struct tgsi_src_register *);
		unsigned flags = va_arg(ap, unsigned);
		struct ir3_register *reg;
		if (flags & IR3_REG_IMMED) {
			reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
			/* this is an ugly cast.. should have put flags first! */
			reg->iim_val = *(int *)&src;
		} else {
			reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
		}
		/* NEGATE toggles (for srcs that already carry a negate): */
		reg->flags |= flags & ~IR3_REG_NEGATE;
		if (flags & IR3_REG_NEGATE)
			reg->flags ^= IR3_REG_NEGATE;
	}
	va_end(ap);

	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *cur;

			/* first written component reuses the original instr: */
			if (n++ == 0) {
				cur = instr;
			} else {
				cur = instr_clone(ctx, instr);
			}

			ssa_dst(ctx, cur, dst, i);

			/* fix-up dst register component: */
			cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);

			/* fix-up src register component: */
			va_start(ap, nsrcs);
			for (j = 0; j < nsrcs; j++) {
				struct ir3_register *reg = cur->regs[j+1];
				struct tgsi_src_register *src =
						va_arg(ap, struct tgsi_src_register *);
				unsigned flags = va_arg(ap, unsigned);
				if (reg->flags & IR3_REG_SSA) {
					ssa_src(ctx, reg, src, src_swiz(src, i));
				} else if (!(flags & IR3_REG_IMMED)) {
					reg->num = regid(reg->num >> 2, src_swiz(src, i));
				}
			}
			va_end(ap);
		}
	}

	instr_atomic_end(ctx);
}

/*
 * Handlers for TGSI instructions which do not have a 1:1 mapping to
 * native instructions:
 */

/* CLAMP dst, src0, src1, src2 -> dst = min(max(src0, src1), src2) */
static void
trans_clamp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct tgsi_src_register *src2 = &inst->Src[2].Register;

	create_clamp(ctx, dst, src0, src1, src2);

	put_dst(ctx, inst, dst);
}

/* ARL(x) = x, but mova from hrN.x to a0..
 */
static void
trans_arl(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	unsigned chan = src->SwizzleX;

	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);

	/* NOTE: we allocate a temporary from a flat register
	 * namespace (ignoring half vs full).  It turns out
	 * not to really matter since registers get reassigned
	 * later in ir3_ra which (hopefully!) can deal a bit
	 * better with mixed half and full precision.
	 */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* cov.{u,f}{32,16}s16 Rtmp, Rsrc -- ARL converts from float,
	 * UARL from unsigned int:
	 */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
			get_ftype(ctx) : get_utype(ctx);
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, src, chan);

	/* shl.b Rtmp, Rtmp, 2 -- scale index to scalar components: */
	instr = instr_create(ctx, 2, OPC_SHL_B);
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;

	/* mova a0, Rtmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_S16;
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}

/*
 * texture fetch/sample instructions:
 */

/* Layout of the first sample argument, computed by fill_tex_info(). */
struct tex_info {
	int8_t order[4];      /* src channel for each coord slot, -1 = unused */
	int8_t args;          /* number of args (1, or 2 when bias/lod present) */
	unsigned src_wrmask, flags;
};

/* Static per-TGSI-target properties: */
struct target_info {
	uint8_t dims;
	uint8_t cube;
	uint8_t array;
	uint8_t shadow;
};

static const struct target_info tex_targets[] = {
	[TGSI_TEXTURE_1D]               = { 1, 0, 0, 0 },
	[TGSI_TEXTURE_2D]               = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_3D]               = { 3, 0, 0, 0 },
	[TGSI_TEXTURE_CUBE]             = { 3, 1, 0, 0 },
	[TGSI_TEXTURE_RECT]             = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_SHADOW1D]         = { 1, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOW2D]         = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOWRECT]       = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_1D_ARRAY]         = { 1, 0, 1, 0 },
	[TGSI_TEXTURE_2D_ARRAY]         = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_SHADOW1D_ARRAY]   = { 1, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOW2D_ARRAY]   = { 2, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOWCUBE]       = { 3, 1, 0, 1 },
	[TGSI_TEXTURE_2D_MSAA]          = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_2D_ARRAY_MSAA]    = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_CUBE_ARRAY]       = { 3, 1, 1, 0 },
	[TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
};

/* Fill 'info' (flags, arg count, coord ordering, src writemask) for
 * the texture instruction 'inst'.
 */
static void
fill_tex_info(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		struct tex_info *info)
{
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];

	if (tgt->dims == 3)
		info->flags |= IR3_INSTR_3D;
	if (tgt->array)
		info->flags |= IR3_INSTR_A;
	if (tgt->shadow)
		info->flags |= IR3_INSTR_S;

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXF:
		info->args = 2;
		break;
	case TGSI_OPCODE_TXP:
		info->flags |= IR3_INSTR_P;
		/* fallthrough */
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXD:
		info->args = 1;
		break;
	}

	/*
	 * lay out the first argument in the proper order:
	 *  - actual coordinates first
	 *  - array index
	 *  - shadow reference
	 *  - projection w
	 *
	 * bias/lod go into the second arg
	 */
	int arg, pos = 0;
	for (arg = 0; arg < tgt->dims; arg++)
		info->order[arg] = pos++;
	if (tgt->dims == 1)
info->order[pos++] = -1; 1188 if (tgt->shadow) 1189 info->order[pos++] = MAX2(arg + tgt->array, 2); 1190 if (tgt->array) 1191 info->order[pos++] = arg++; 1192 if (info->flags & IR3_INSTR_P) 1193 info->order[pos++] = 3; 1194 1195 info->src_wrmask = (1 << pos) - 1; 1196 1197 for (; pos < 4; pos++) 1198 info->order[pos] = -1; 1199 1200 assert(pos <= 4); 1201} 1202 1203static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4]) 1204{ 1205 unsigned i; 1206 for (i = 1; (i < 4) && order[i] >= 0; i++) 1207 if (src_swiz(src, i) != (src_swiz(src, 0) + order[i])) 1208 return false; 1209 return true; 1210} 1211 1212static bool is_1d(unsigned tex) 1213{ 1214 return tex_targets[tex].dims == 1; 1215} 1216 1217static struct tgsi_src_register * 1218get_tex_coord(struct ir3_compile_context *ctx, 1219 struct tgsi_full_instruction *inst, 1220 const struct tex_info *tinf) 1221{ 1222 struct tgsi_src_register *coord = &inst->Src[0].Register; 1223 struct ir3_instruction *instr; 1224 unsigned tex = inst->Texture.Texture; 1225 struct tgsi_dst_register tmp_dst; 1226 struct tgsi_src_register *tmp_src; 1227 type_t type_mov = get_ftype(ctx); 1228 unsigned j; 1229 1230 /* need to move things around: */ 1231 tmp_src = get_internal_temp(ctx, &tmp_dst); 1232 1233 for (j = 0; j < 4; j++) { 1234 if (tinf->order[j] < 0) 1235 continue; 1236 instr = instr_create(ctx, 1, 0); /* mov */ 1237 instr->cat1.src_type = type_mov; 1238 instr->cat1.dst_type = type_mov; 1239 add_dst_reg(ctx, instr, &tmp_dst, j); 1240 add_src_reg(ctx, instr, coord, 1241 src_swiz(coord, tinf->order[j])); 1242 } 1243 1244 /* fix up .y coord: */ 1245 if (is_1d(tex)) { 1246 struct ir3_register *imm; 1247 instr = instr_create(ctx, 1, 0); /* mov */ 1248 instr->cat1.src_type = type_mov; 1249 instr->cat1.dst_type = type_mov; 1250 add_dst_reg(ctx, instr, &tmp_dst, 1); /* .y */ 1251 imm = ir3_reg_create(instr, 0, IR3_REG_IMMED); 1252 if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) 1253 imm->iim_val = 0; 1254 else 1255 
imm->fim_val = 0.5; 1256 } 1257 1258 return tmp_src; 1259} 1260 1261static void 1262trans_samp(const struct instr_translater *t, 1263 struct ir3_compile_context *ctx, 1264 struct tgsi_full_instruction *inst) 1265{ 1266 struct ir3_instruction *instr, *collect; 1267 struct ir3_register *reg; 1268 struct tgsi_dst_register *dst = &inst->Dst[0].Register; 1269 struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy; 1270 struct tgsi_src_register zero; 1271 const struct target_info *tgt = &tex_targets[inst->Texture.Texture]; 1272 struct tex_info tinf; 1273 int i; 1274 1275 memset(&tinf, 0, sizeof(tinf)); 1276 fill_tex_info(ctx, inst, &tinf); 1277 coord = get_tex_coord(ctx, inst, &tinf); 1278 get_immediate(ctx, &zero, 0); 1279 1280 switch (inst->Instruction.Opcode) { 1281 case TGSI_OPCODE_TXB2: 1282 orig = &inst->Src[1].Register; 1283 samp = &inst->Src[2].Register; 1284 break; 1285 case TGSI_OPCODE_TXD: 1286 orig = &inst->Src[0].Register; 1287 dpdx = &inst->Src[1].Register; 1288 dpdy = &inst->Src[2].Register; 1289 samp = &inst->Src[3].Register; 1290 if (is_rel_or_const(dpdx)) 1291 dpdx = get_unconst(ctx, dpdx); 1292 if (is_rel_or_const(dpdy)) 1293 dpdy = get_unconst(ctx, dpdy); 1294 break; 1295 default: 1296 orig = &inst->Src[0].Register; 1297 samp = &inst->Src[1].Register; 1298 break; 1299 } 1300 if (tinf.args > 1 && is_rel_or_const(orig)) 1301 orig = get_unconst(ctx, orig); 1302 1303 /* scale up integer coords for TXF based on the LOD */ 1304 if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 1305 struct tgsi_dst_register tmp_dst; 1306 struct tgsi_src_register *tmp_src; 1307 type_t type_mov = get_utype(ctx); 1308 1309 tmp_src = get_internal_temp(ctx, &tmp_dst); 1310 for (i = 0; i < tgt->dims; i++) { 1311 instr = instr_create(ctx, 2, OPC_SHL_B); 1312 add_dst_reg(ctx, instr, &tmp_dst, i); 1313 add_src_reg(ctx, instr, coord, src_swiz(coord, i)); 1314 add_src_reg(ctx, instr, orig, orig->SwizzleW); 1315 } 1316 if (tgt->dims < 2) { 1317 instr = instr_create(ctx, 
1, 0); 1318 instr->cat1.src_type = type_mov; 1319 instr->cat1.dst_type = type_mov; 1320 add_dst_reg(ctx, instr, &tmp_dst, i); 1321 add_src_reg(ctx, instr, &zero, 0); 1322 i++; 1323 } 1324 if (tgt->array) { 1325 instr = instr_create(ctx, 1, 0); 1326 instr->cat1.src_type = type_mov; 1327 instr->cat1.dst_type = type_mov; 1328 add_dst_reg(ctx, instr, &tmp_dst, i); 1329 add_src_reg(ctx, instr, coord, src_swiz(coord, i)); 1330 } 1331 coord = tmp_src; 1332 } 1333 1334 if (inst->Texture.NumOffsets) { 1335 struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0]; 1336 struct tgsi_src_register offset_src = {0}; 1337 1338 offset_src.File = tex_offset->File; 1339 offset_src.Index = tex_offset->Index; 1340 offset_src.SwizzleX = tex_offset->SwizzleX; 1341 offset_src.SwizzleY = tex_offset->SwizzleY; 1342 offset_src.SwizzleZ = tex_offset->SwizzleZ; 1343 offset = get_unconst(ctx, &offset_src); 1344 tinf.flags |= IR3_INSTR_O; 1345 } 1346 1347 instr = instr_create(ctx, 5, t->opc); 1348 instr->cat5.type = get_ftype(ctx); 1349 instr->cat5.samp = samp->Index; 1350 instr->cat5.tex = samp->Index; 1351 instr->flags |= tinf.flags; 1352 1353 add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); 1354 1355 reg = ir3_reg_create(instr, 0, IR3_REG_SSA); 1356 1357 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI); 1358 ir3_reg_create(collect, 0, 0); 1359 for (i = 0; i < 4; i++) 1360 if (tinf.src_wrmask & (1 << i)) 1361 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), 1362 coord, src_swiz(coord, i)); 1363 else if (tinf.src_wrmask & ~((1 << i) - 1)) 1364 ir3_reg_create(collect, 0, 0); 1365 1366 /* Attach derivatives onto the end of the fan-in. Derivatives start after 1367 * the 4th argument, so make sure that fi is padded up to 4 first. 
1368 */ 1369 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { 1370 while (collect->regs_count < 5) 1371 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); 1372 for (i = 0; i < tgt->dims; i++) 1373 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i); 1374 if (tgt->dims < 2) 1375 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); 1376 for (i = 0; i < tgt->dims; i++) 1377 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i); 1378 if (tgt->dims < 2) 1379 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); 1380 tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4; 1381 } 1382 1383 reg->instr = collect; 1384 reg->wrmask = tinf.src_wrmask; 1385 1386 /* The second argument contains the offsets, followed by the lod/bias 1387 * argument. This is constructed more manually due to the dynamic nature. 1388 */ 1389 if (inst->Texture.NumOffsets == 0 && tinf.args == 1) 1390 return; 1391 1392 reg = ir3_reg_create(instr, 0, IR3_REG_SSA); 1393 1394 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI); 1395 ir3_reg_create(collect, 0, 0); 1396 1397 if (inst->Texture.NumOffsets) { 1398 for (i = 0; i < tgt->dims; i++) 1399 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), 1400 offset, i); 1401 if (tgt->dims < 2) 1402 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0); 1403 } 1404 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2) 1405 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), 1406 orig, orig->SwizzleX); 1407 else if (tinf.args > 1) 1408 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), 1409 orig, orig->SwizzleW); 1410 1411 reg->instr = collect; 1412 reg->wrmask = (1 << (collect->regs_count - 1)) - 1; 1413} 1414 1415static void 1416trans_txq(const struct instr_translater *t, 1417 struct ir3_compile_context *ctx, 1418 struct tgsi_full_instruction *inst) 1419{ 1420 struct ir3_instruction *instr; 1421 struct tgsi_dst_register *dst = &inst->Dst[0].Register; 1422 struct tgsi_src_register 
*level = &inst->Src[0].Register; 1423 struct tgsi_src_register *samp = &inst->Src[1].Register; 1424 struct tex_info tinf; 1425 1426 memset(&tinf, 0, sizeof(tinf)); 1427 fill_tex_info(ctx, inst, &tinf); 1428 if (is_rel_or_const(level)) 1429 level = get_unconst(ctx, level); 1430 1431 instr = instr_create(ctx, 5, OPC_GETSIZE); 1432 instr->cat5.type = get_utype(ctx); 1433 instr->cat5.samp = samp->Index; 1434 instr->cat5.tex = samp->Index; 1435 instr->flags |= tinf.flags; 1436 1437 add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask); 1438 add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1); 1439} 1440 1441/* DDX/DDY */ 1442static void 1443trans_deriv(const struct instr_translater *t, 1444 struct ir3_compile_context *ctx, 1445 struct tgsi_full_instruction *inst) 1446{ 1447 struct ir3_instruction *instr; 1448 struct tgsi_dst_register *dst = &inst->Dst[0].Register; 1449 struct tgsi_src_register *src = &inst->Src[0].Register; 1450 static const int8_t order[4] = {0, 1, 2, 3}; 1451 1452 if (!check_swiz(src, order)) { 1453 struct tgsi_dst_register tmp_dst; 1454 struct tgsi_src_register *tmp_src; 1455 1456 tmp_src = get_internal_temp(ctx, &tmp_dst); 1457 create_mov(ctx, &tmp_dst, src); 1458 1459 src = tmp_src; 1460 } 1461 1462 /* This might be a workaround for hw bug? Blob compiler always 1463 * seems to work two components at a time for dsy/dsx. It does 1464 * actually seem to work in some cases (or at least some piglit 1465 * tests) for four components at a time. 
But seems more reliable 1466 * to split this into two instructions like the blob compiler 1467 * does: 1468 */ 1469 1470 instr = instr_create(ctx, 5, t->opc); 1471 instr->cat5.type = get_ftype(ctx); 1472 add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3); 1473 add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3); 1474 1475 instr = instr_create(ctx, 5, t->opc); 1476 instr->cat5.type = get_ftype(ctx); 1477 add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3); 1478 add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3); 1479} 1480 1481/* 1482 * SEQ(a,b) = (a == b) ? 1.0 : 0.0 1483 * cmps.f.eq tmp0, a, b 1484 * cov.u16f16 dst, tmp0 1485 * 1486 * SNE(a,b) = (a != b) ? 1.0 : 0.0 1487 * cmps.f.ne tmp0, a, b 1488 * cov.u16f16 dst, tmp0 1489 * 1490 * SGE(a,b) = (a >= b) ? 1.0 : 0.0 1491 * cmps.f.ge tmp0, a, b 1492 * cov.u16f16 dst, tmp0 1493 * 1494 * SLE(a,b) = (a <= b) ? 1.0 : 0.0 1495 * cmps.f.le tmp0, a, b 1496 * cov.u16f16 dst, tmp0 1497 * 1498 * SGT(a,b) = (a > b) ? 1.0 : 0.0 1499 * cmps.f.gt tmp0, a, b 1500 * cov.u16f16 dst, tmp0 1501 * 1502 * SLT(a,b) = (a < b) ? 1.0 : 0.0 1503 * cmps.f.lt tmp0, a, b 1504 * cov.u16f16 dst, tmp0 1505 * 1506 * CMP(a,b,c) = (a < 0.0) ? 
b : c
 *   cmps.f.lt tmp0, a, {0.0}
 *   sel.b16 dst, b, tmp0, c
 */
static void
trans_cmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval0;
	/* final instruction for CMP() uses orig src1 and src2: */
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	/* pick hw condition code; F-prefixed opcodes produce ~0/0 instead
	 * of 1.0/0.0 and are handled differently after the compare:
	 */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_SLE:
		condition = IR3_COND_LE;
		break;
	case TGSI_OPCODE_SGT:
		condition = IR3_COND_GT;
		break;
	case TGSI_OPCODE_CMP:
		/* CMP compares 'a' against constant zero: */
		get_immediate(ctx, &constval0, fui(0.0));
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0.0} */
		condition = IR3_COND_LT;
		break;
	default:
		compile_assert(ctx, 0);
		return;
	}

	/* both srcs being CONST is not allowed, lift one into a GPR: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	/* cmps.f.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLT:
		/* convert the 0/1 compare result to float 0.0/1.0: */
		/* cov.u16f16 dst, tmp0 */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_utype(ctx);
		instr->cat1.dst_type = get_ftype(ctx);
		vectorize(ctx, instr, dst, 1, tmp_src, 0);
		break;
	case TGSI_OPCODE_FSEQ:
	case TGSI_OPCODE_FSGE:
	case TGSI_OPCODE_FSNE:
	case TGSI_OPCODE_FSLT:
		/* turn the 0/1 compare result into 0/~0: */
		/* absneg.s dst, (neg)tmp0 */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
		break;
	case TGSI_OPCODE_CMP:
		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);

		break;
	}

	put_dst(ctx, inst, dst);
}

/*
 * USNE(a,b) = (a != b) ? ~0 : 0
 *   cmps.u32.ne dst, a, b
 *
 * USEQ(a,b) = (a == b) ? ~0 : 0
 *   cmps.u32.eq dst, a, b
 *
 * ISGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.s32.ge dst, a, b
 *
 * USGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.u32.ge dst, a, b
 *
 * ISLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.s32.lt dst, a, b
 *
 * USLT(a,b) = (a < b) ?
~0 : 0 1622 * cmps.u32.lt dst, a, b 1623 * 1624 */ 1625static void 1626trans_icmp(const struct instr_translater *t, 1627 struct ir3_compile_context *ctx, 1628 struct tgsi_full_instruction *inst) 1629{ 1630 struct ir3_instruction *instr; 1631 struct tgsi_dst_register *dst = get_dst(ctx, inst); 1632 struct tgsi_dst_register tmp_dst; 1633 struct tgsi_src_register *tmp_src; 1634 struct tgsi_src_register *a0, *a1; 1635 unsigned condition; 1636 1637 a0 = &inst->Src[0].Register; /* a */ 1638 a1 = &inst->Src[1].Register; /* b */ 1639 1640 switch (t->tgsi_opc) { 1641 case TGSI_OPCODE_USNE: 1642 condition = IR3_COND_NE; 1643 break; 1644 case TGSI_OPCODE_USEQ: 1645 condition = IR3_COND_EQ; 1646 break; 1647 case TGSI_OPCODE_ISGE: 1648 case TGSI_OPCODE_USGE: 1649 condition = IR3_COND_GE; 1650 break; 1651 case TGSI_OPCODE_ISLT: 1652 case TGSI_OPCODE_USLT: 1653 condition = IR3_COND_LT; 1654 break; 1655 1656 default: 1657 compile_assert(ctx, 0); 1658 return; 1659 } 1660 1661 if (is_const(a0) && is_const(a1)) 1662 a0 = get_unconst(ctx, a0); 1663 1664 tmp_src = get_internal_temp(ctx, &tmp_dst); 1665 /* cmps.{u32,s32}.<cond> tmp, a0, a1 */ 1666 instr = instr_create(ctx, 2, t->opc); 1667 instr->cat2.condition = condition; 1668 vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); 1669 1670 /* absneg.s dst, (neg)tmp */ 1671 instr = instr_create(ctx, 2, OPC_ABSNEG_S); 1672 vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE); 1673 1674 put_dst(ctx, inst, dst); 1675} 1676 1677/* 1678 * UCMP(a,b,c) = a ? 
b : c 1679 * sel.b16 dst, b, a, c 1680 */ 1681static void 1682trans_ucmp(const struct instr_translater *t, 1683 struct ir3_compile_context *ctx, 1684 struct tgsi_full_instruction *inst) 1685{ 1686 struct ir3_instruction *instr; 1687 struct tgsi_dst_register *dst = get_dst(ctx, inst); 1688 struct tgsi_src_register *a0, *a1, *a2; 1689 1690 a0 = &inst->Src[0].Register; /* a */ 1691 a1 = &inst->Src[1].Register; /* b */ 1692 a2 = &inst->Src[2].Register; /* c */ 1693 1694 if (is_rel_or_const(a0)) 1695 a0 = get_unconst(ctx, a0); 1696 1697 /* sel.{b32,b16} dst, b, a, c */ 1698 instr = instr_create(ctx, 3, OPC_SEL_B32); 1699 vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0); 1700 put_dst(ctx, inst, dst); 1701} 1702 1703/* 1704 * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0 1705 * cmps.s.lt tmp_neg, a, 0 # 1 if a is negative 1706 * cmps.s.gt tmp_pos, a, 0 # 1 if a is positive 1707 * sub.u dst, tmp_pos, tmp_neg 1708 */ 1709static void 1710trans_issg(const struct instr_translater *t, 1711 struct ir3_compile_context *ctx, 1712 struct tgsi_full_instruction *inst) 1713{ 1714 struct ir3_instruction *instr; 1715 struct tgsi_dst_register *dst = get_dst(ctx, inst); 1716 struct tgsi_src_register *a = &inst->Src[0].Register; 1717 struct tgsi_dst_register neg_dst, pos_dst; 1718 struct tgsi_src_register *neg_src, *pos_src; 1719 1720 neg_src = get_internal_temp(ctx, &neg_dst); 1721 pos_src = get_internal_temp(ctx, &pos_dst); 1722 1723 /* cmps.s.lt neg, a, 0 */ 1724 instr = instr_create(ctx, 2, OPC_CMPS_S); 1725 instr->cat2.condition = IR3_COND_LT; 1726 vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED); 1727 1728 /* cmps.s.gt pos, a, 0 */ 1729 instr = instr_create(ctx, 2, OPC_CMPS_S); 1730 instr->cat2.condition = IR3_COND_GT; 1731 vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED); 1732 1733 /* sub.u dst, pos, neg */ 1734 instr = instr_create(ctx, 2, OPC_SUB_U); 1735 vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0); 1736 1737 put_dst(ctx, inst, dst); 1738} 1739 1740 1741 
/*
 * Conditional / Flow control
 */

/* Record an open branch (meta:flow instr) on the branch stack: */
static void
push_branch(struct ir3_compile_context *ctx, bool inv,
		struct ir3_instruction *instr, struct ir3_instruction *cond)
{
	unsigned int idx = ctx->branch_count++;
	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
	ctx->branch[idx].instr = instr;
	ctx->branch[idx].inv = inv;
	/* else side of branch has same condition: */
	if (!inv)
		ctx->branch[idx].cond = cond;
}

/* Pop the innermost open branch, returning its meta:flow instr: */
static struct ir3_instruction *
pop_branch(struct ir3_compile_context *ctx)
{
	unsigned int idx = --ctx->branch_count;
	return ctx->branch[idx].instr;
}

static void
trans_if(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *cond;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval;

	get_immediate(ctx, &constval, fui(0.0));
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	if (is_const(src))
		src = get_unconst(ctx, src);

	/* cmps.{f,u}.ne tmp0, b, {0.0} */
	instr = instr_create(ctx, 2, t->opc);
	add_dst_reg(ctx, instr, &tmp_dst, 0);
	add_src_reg(ctx, instr, src, src->SwizzleX);
	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
	instr->cat2.condition = IR3_COND_NE;

	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
	cond = instr->regs[1]->instr;

	/* meta:flow tmp0 */
	instr = instr_create(ctx, -1, OPC_META_FLOW);
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);

	push_branch(ctx, false, instr, cond);
	instr->flow.if_block = push_block(ctx);
}

static void
trans_else(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;

	/* close the if-side block and reuse the same meta:flow instr
	 * (re-pushed with inv=true) for the else side:
	 */
	pop_block(ctx);

	instr = pop_branch(ctx);

	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	push_branch(ctx, true, instr, NULL);
	instr->flow.else_block = push_block(ctx);
}

/* Walk up the block tree to find the closest enclosing write of
 * temporary 'n':
 */
static struct ir3_instruction *
find_temporary(struct ir3_block *block, unsigned n)
{
	if (block->parent && !block->temporaries[n])
		return find_temporary(block->parent, n);
	return block->temporaries[n];
}

/* Likewise for shader output 'n': */
static struct ir3_instruction *
find_output(struct ir3_block *block, unsigned n)
{
	if (block->parent && !block->outputs[n])
		return find_output(block->parent, n);
	return block->outputs[n];
}

static struct ir3_instruction *
create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
		struct ir3_instruction *a, struct ir3_instruction *b)
{
	struct ir3_instruction *phi;

	compile_assert(ctx, cond);

	/* Either side of the condition could be null..  which
	 * indicates a variable written on only one side of the
	 * branch.  Normally this should only be variables not
	 * used outside of that side of the branch.  So we could
	 * just 'return a ? a : b;' in that case.  But for better
	 * defined undefined behavior we just stick in imm{0.0}.
	 * In the common case of a value only used within the
	 * one side of the branch, the PHI instruction will not
	 * get scheduled
	 */
	if (!a)
		a = create_immed(ctx, 0.0);
	if (!b)
		b = create_immed(ctx, 0.0);

	phi = instr_create(ctx, -1, OPC_META_PHI);
	ir3_reg_create(phi, 0, 0);  /* dummy dst */
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;

	return phi;
}

/* Close an if/else: emit PHI instructions for every temporary and shader
 * output written in either arm, and compact the arms' outputs[] arrays
 * down to just the values the PHIs consume.
 */
static void
trans_endif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct ir3_block *ifb, *elseb;
	struct ir3_instruction **ifout, **elseout;
	unsigned i, ifnout = 0, elsenout = 0;

	pop_block(ctx);

	instr = pop_branch(ctx);

	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	ifb = instr->flow.if_block;
	elseb = instr->flow.else_block;
	/* if there is no else block, the parent block is used for the
	 * branch-not-taken src of the PHI instructions:
	 */
	if (!elseb)
		elseb = ifb->parent;

	/* worst case sizes: */
	ifnout = ifb->ntemporaries + ifb->noutputs;
	elsenout = elseb->ntemporaries + elseb->noutputs;

	ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
	if (elseb != ifb->parent)
		elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);

	ifnout = 0;
	elsenout = 0;

	/* generate PHI instructions for any temporaries written: */
	for (i = 0; i < ifb->ntemporaries; i++) {
		struct ir3_instruction *a = ifb->temporaries[i];
		struct ir3_instruction *b = elseb->temporaries[i];

		/* if temporary written in if-block, or if else block
		 * is present and temporary written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_temporary(ifb, i);
			if (!b)
				b = find_temporary(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->temporaries[i] = phi;
		}
	}

	compile_assert(ctx, ifb->noutputs == elseb->noutputs);

	/* .. and any outputs written: */
	for (i = 0; i < ifb->noutputs; i++) {
		struct ir3_instruction *a = ifb->outputs[i];
		struct ir3_instruction *b = elseb->outputs[i];

		/* if output written in if-block, or if else block
		 * is present and output written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_output(ifb, i);
			if (!b)
				b = find_output(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->outputs[i] = phi;
		}
	}

	/* swap in the compacted output arrays: */
	ifb->noutputs = ifnout;
	ifb->outputs = ifout;

	if (elseb != ifb->parent) {
		elseb->noutputs = elsenout;
		elseb->outputs = elseout;
	}

	// TODO maybe we want to compact block->inputs?
}

/*
 * Kill
 */

static void
trans_kill(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *immed, *cond = NULL;
	bool inv = false;

	/* NOTE(review): single-case switch; anything other than KILL would
	 * leave cond NULL and trip the compile_assert below:
	 */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_KILL:
		/* unconditional kill, use enclosing if condition: */
		if (ctx->branch_count > 0) {
			unsigned int idx = ctx->branch_count - 1;
			cond = ctx->branch[idx].cond;
			inv = ctx->branch[idx].inv;
		} else {
			cond = create_immed(ctx, 1.0);
		}

		break;
	}

	compile_assert(ctx, cond);

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	cond = instr;

	/* kill p0.x */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	ctx->kill[ctx->kill_count++] = instr;

	ctx->so->has_kill = true;
}

/*
 * Kill-If
 */

static void
trans_killif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct ir3_instruction *instr, *immed, *cond = NULL;
	bool inv = false;

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	add_src_reg(ctx, instr, src, src->SwizzleX);

	cond = instr;

	/* kill p0.x */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	ctx->kill[ctx->kill_count++] = instr;

	ctx->so->has_kill = true;

}
/*
 * I2F / U2F / F2I / F2U
 */

static void
trans_cov(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src = &inst->Src[0].Register;

	/* cov.<srctype><dsttype> dst, src */
	instr = instr_create(ctx, 1, 0);
	/* NOTE(review): no default case -- src_type/dst_type stay
	 * uninitialized for any other opcode; presumably the translater
	 * table only routes these four opcodes here:
	 */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_U2F:
		instr->cat1.src_type = TYPE_U32;
		instr->cat1.dst_type = TYPE_F32;
		break;
	case TGSI_OPCODE_I2F:
		instr->cat1.src_type = TYPE_S32;
		instr->cat1.dst_type = TYPE_F32;
		break;
	case TGSI_OPCODE_F2U:
		instr->cat1.src_type = TYPE_F32;
		instr->cat1.dst_type = TYPE_U32;
		break;
	case TGSI_OPCODE_F2I:
		instr->cat1.src_type = TYPE_F32;
		instr->cat1.dst_type = TYPE_S32;
		break;

	}
	vectorize(ctx, instr, dst, 1, src, 0);
	put_dst(ctx, inst, dst);
}

/*
 * UMUL / UMAD
 *
 * There is no 32-bit multiply instruction, so splitting a and b into high and
 * low components, we get that
 *
 * dst = al * bl + ah * bl << 16 + al * bh << 16
 *
 * mull.u tmp0, a, b (mul low, i.e. al * bl)
 * madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
 * madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
 *
 * For UMAD, add in the extra argument after mull.u.
 */
static void
trans_umul(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_src_register *b = &inst->Src[1].Register;

	struct tgsi_dst_register tmp0_dst, tmp1_dst;
	struct tgsi_src_register *tmp0_src, *tmp1_src;

	tmp0_src = get_internal_temp(ctx, &tmp0_dst);
	tmp1_src = get_internal_temp(ctx, &tmp1_dst);

	/* mull/madsh can't take relative or const srcs directly: */
	if (is_rel_or_const(a))
		a = get_unconst(ctx, a);
	if (is_rel_or_const(b))
		b = get_unconst(ctx, b);

	/* mull.u tmp0, a, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);

	if (t->tgsi_opc == TGSI_OPCODE_UMAD) {
		struct tgsi_src_register *c = &inst->Src[2].Register;

		/* fold the addend into the low partial product:
		 * add.u tmp0, tmp0, c
		 */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, &tmp0_dst, 2, tmp0_src, 0, c, 0);
	}

	/* madsh.m16 tmp1, a, b, tmp0 */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);

	/* madsh.m16 dst, b, a, tmp1 */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
	put_dst(ctx, inst, dst);
}

/*
 * IDIV / UDIV / MOD / UMOD
 *
 * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
 * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
 */
/* Lower TGSI integer division/modulo.  The hw has no integer divide, so
 * an initial quotient estimate is formed in float (rcp + multiply) and
 * then refined with two integer correction rounds so the result is exact
 * for the full 32-bit range.  For IDIV/MOD the operands are made positive
 * first and the sign of the result fixed up at the end; for MOD/UMOD the
 * quotient is kept in a temp (premod_dst) and the remainder computed as
 * a - q*b.  NOTE: statement order matters throughout — each step consumes
 * the temp written by the previous one.
 */
static void
trans_idiv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_src_register *b = &inst->Src[1].Register;

	/* af/bf: float copies of |a|/|b|, q: quotient, r: scratch,
	 * a/b: integer copies (absolute value for the signed case):
	 */
	struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
	struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;

	struct tgsi_src_register negative_2, thirty_one;
	type_t src_type;

	if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
		src_type = get_stype(ctx);
	else
		src_type = get_utype(ctx);

	af_src = get_internal_temp(ctx, &af_dst);
	bf_src = get_internal_temp(ctx, &bf_dst);
	q_src = get_internal_temp(ctx, &q_dst);
	r_src = get_internal_temp(ctx, &r_dst);
	a_src = get_internal_temp(ctx, &a_dst);
	b_src = get_internal_temp(ctx, &b_dst);

	get_immediate(ctx, &negative_2, -2);
	get_immediate(ctx, &thirty_one, 31);

	/* for MOD/UMOD the quotient is only an intermediate value: */
	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
		premod_dst = &q_dst;

	/* cov.[us]32f32 af, numerator */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &af_dst, 1, a, 0);

	/* cov.[us]32f32 bf, denominator */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &bf_dst, 1, b, 0);

	/* Get the absolute values for IDIV */
	if (type_sint(src_type)) {
		/* absneg.f af, (abs)af */
		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
		vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_ABS);

		/* absneg.f bf, (abs)bf */
		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
		vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_ABS);

		/* absneg.s a, (abs)numerator */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_ABS);

		/* absneg.s b, (abs)denominator */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_ABS);
	} else {
		/* mov.u32u32 a, numerator */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = src_type;
		instr->cat1.dst_type = src_type;
		vectorize(ctx, instr, &a_dst, 1, a, 0);

		/* mov.u32u32 b, denominator */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = src_type;
		instr->cat1.dst_type = src_type;
		vectorize(ctx, instr, &b_dst, 1, b, 0);
	}

	/* rcp.f bf, bf */
	instr = instr_create(ctx, 4, OPC_RCP);
	vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);

	/* That's right, subtract 2 as an integer from the float */
	/* add.u bf, bf, -2 */
	instr = instr_create(ctx, 2, OPC_ADD_U);
	vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);

	/* mul.f q, af, bf */
	instr = instr_create(ctx, 2, OPC_MUL_F);
	vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);

	/* cov.f32[us]32 q, q */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = src_type;
	vectorize(ctx, instr, &q_dst, 1, q_src, 0);

	/* integer multiply q by b */
	/* mull.u r, q, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);

	/* madsh.m16 r, q, b, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);

	/* madsh.m16, r, b, q, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);

	/* sub.u r, a, r */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);

	/* correction round 1: error = float(a - q*b) * 1/b, added to q */

	/* cov.u32f32, r, r */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_utype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &r_dst, 1, r_src, 0);

	/* mul.f r, r, bf */
	instr = instr_create(ctx, 2, OPC_MUL_F);
	vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);

	/* cov.f32u32 r, r */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_utype(ctx);
	vectorize(ctx, instr, &r_dst, 1, r_src, 0);

	/* add.u q, q, r */
	instr = instr_create(ctx, 2, OPC_ADD_U);
	vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);

	/* correction round 2: if remainder >= b, bump q by one more */

	/* mull.u r, q, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);

	/* madsh.m16 r, q, b, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);

	/* madsh.m16 r, b, q, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);

	/* sub.u r, a, r */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);

	/* cmps.u.ge r, r, b */
	instr = instr_create(ctx, 2, OPC_CMPS_U);
	instr->cat2.condition = IR3_COND_GE;
	vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);

	if (type_uint(src_type)) {
		/* add.u dst, q, r */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
	} else {
		/* add.u q, q, r */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);

		/* negate result based on the original arguments */
		if (is_const(a) && is_const(b))
			a = get_unconst(ctx, a);

		/* sign of result = sign(a) ^ sign(b), extracted via shr by 31 */

		/* xor.b r, numerator, denominator */
		instr = instr_create(ctx, 2, OPC_XOR_B);
		vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);

		/* shr.b r, r, 31 */
		instr = instr_create(ctx, 2, OPC_SHR_B);
		vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);

		/* absneg.s b, (neg)q */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_NEGATE);

		/* sel.b dst, b, r, q */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
	}

	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
		/* The division result will have ended up in q. */

		if (is_rel_or_const(b))
			b = get_unconst(ctx, b);

		/* mull.u r, q, b */
		instr = instr_create(ctx, 2, OPC_MULL_U);
		vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);

		/* madsh.m16 r, q, b, r */
		instr = instr_create(ctx, 3, OPC_MADSH_M16);
		vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);

		/* madsh.m16 r, b, q, r */
		instr = instr_create(ctx, 3, OPC_MADSH_M16);
		vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);

		/* sub.u dst, a, r */
		instr = instr_create(ctx, 2, OPC_SUB_U);
		vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
	}

	put_dst(ctx, inst, dst);
}

/*
 * Handlers for TGSI instructions which do have 1:1 mapping to native
 * instructions:
 */

/* cat0 (flow control) opcodes take no operands: */
static void
instr_cat0(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	instr_create(ctx, 0, t->opc);
}

/* cat1 (mov): */
static void
instr_cat1(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src = &inst->Src[0].Register;
	create_mov(ctx,
			dst, src);
	put_dst(ctx, inst, dst);
}

/* cat2 (most two-src ALU ops).  TGSI opcodes that are just a native op
 * plus a src modifier (ABS/INEG/SUB) get the modifier folded into the
 * src flags here:
 */
static void
instr_cat2(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct ir3_instruction *instr;
	unsigned src0_flags = 0, src1_flags = 0;

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_ABS:
	case TGSI_OPCODE_IABS:
		src0_flags = IR3_REG_ABS;
		break;
	case TGSI_OPCODE_INEG:
		src0_flags = IR3_REG_NEGATE;
		break;
	case TGSI_OPCODE_SUB:
		/* SUB is implemented as ADD with negated src1: */
		src1_flags = IR3_REG_NEGATE;
		break;
	}

	switch (t->opc) {
	case OPC_ABSNEG_F:
	case OPC_ABSNEG_S:
	case OPC_CLZ_B:
	case OPC_CLZ_S:
	case OPC_SIGN_F:
	case OPC_FLOOR_F:
	case OPC_CEIL_F:
	case OPC_RNDNE_F:
	case OPC_RNDAZ_F:
	case OPC_TRUNC_F:
	case OPC_NOT_B:
	case OPC_BFREV_B:
	case OPC_SETRM:
	case OPC_CBITS_B:
		/* these only have one src reg */
		instr = instr_create(ctx, 2, t->opc);
		vectorize(ctx, instr, dst, 1, src0, src0_flags);
		break;
	default:
		/* cat2 can only take one const src, so copy one through
		 * a temp if both are const:
		 */
		if (is_const(src0) && is_const(src1))
			src0 = get_unconst(ctx, src0);

		instr = instr_create(ctx, 2, t->opc);
		vectorize(ctx, instr, dst, 2, src0, src0_flags,
				src1, src1_flags);
		break;
	}

	put_dst(ctx, inst, dst);
}

/* cat3 (three-src ALU ops, eg. mad): */
static void
instr_cat3(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct ir3_instruction *instr;

	/* in particular, can't handle const for src1 for cat3..
2467 * for mad, we can swap first two src's if needed: 2468 */ 2469 if (is_rel_or_const(src1)) { 2470 if (is_mad(t->opc) && !is_rel_or_const(src0)) { 2471 struct tgsi_src_register *tmp; 2472 tmp = src0; 2473 src0 = src1; 2474 src1 = tmp; 2475 } else { 2476 src1 = get_unconst(ctx, src1); 2477 } 2478 } 2479 2480 instr = instr_create(ctx, 3, t->opc); 2481 vectorize(ctx, instr, dst, 3, src0, 0, src1, 0, 2482 &inst->Src[2].Register, 0); 2483 put_dst(ctx, inst, dst); 2484} 2485 2486static void 2487instr_cat4(const struct instr_translater *t, 2488 struct ir3_compile_context *ctx, 2489 struct tgsi_full_instruction *inst) 2490{ 2491 struct tgsi_dst_register *dst = get_dst(ctx, inst); 2492 struct tgsi_src_register *src = &inst->Src[0].Register; 2493 struct ir3_instruction *instr; 2494 unsigned i; 2495 2496 /* seems like blob compiler avoids const as src.. */ 2497 if (is_const(src)) 2498 src = get_unconst(ctx, src); 2499 2500 /* we need to replicate into each component: */ 2501 for (i = 0; i < 4; i++) { 2502 if (dst->WriteMask & (1 << i)) { 2503 instr = instr_create(ctx, 4, t->opc); 2504 add_dst_reg(ctx, instr, dst, i); 2505 add_src_reg(ctx, instr, src, src->SwizzleX); 2506 } 2507 } 2508 2509 put_dst(ctx, inst, dst); 2510} 2511 2512static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { 2513#define INSTR(n, f, ...) 
\ 2514 [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ } 2515 2516 INSTR(MOV, instr_cat1), 2517 INSTR(RCP, instr_cat4, .opc = OPC_RCP), 2518 INSTR(RSQ, instr_cat4, .opc = OPC_RSQ), 2519 INSTR(SQRT, instr_cat4, .opc = OPC_SQRT), 2520 INSTR(MUL, instr_cat2, .opc = OPC_MUL_F), 2521 INSTR(ADD, instr_cat2, .opc = OPC_ADD_F), 2522 INSTR(SUB, instr_cat2, .opc = OPC_ADD_F), 2523 INSTR(MIN, instr_cat2, .opc = OPC_MIN_F), 2524 INSTR(MAX, instr_cat2, .opc = OPC_MAX_F), 2525 INSTR(UADD, instr_cat2, .opc = OPC_ADD_U), 2526 INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S), 2527 INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U), 2528 INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S), 2529 INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U), 2530 INSTR(AND, instr_cat2, .opc = OPC_AND_B), 2531 INSTR(OR, instr_cat2, .opc = OPC_OR_B), 2532 INSTR(NOT, instr_cat2, .opc = OPC_NOT_B), 2533 INSTR(XOR, instr_cat2, .opc = OPC_XOR_B), 2534 INSTR(UMUL, trans_umul), 2535 INSTR(UMAD, trans_umul), 2536 INSTR(UDIV, trans_idiv), 2537 INSTR(IDIV, trans_idiv), 2538 INSTR(MOD, trans_idiv), 2539 INSTR(UMOD, trans_idiv), 2540 INSTR(SHL, instr_cat2, .opc = OPC_SHL_B), 2541 INSTR(USHR, instr_cat2, .opc = OPC_SHR_B), 2542 INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B), 2543 INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S), 2544 INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S), 2545 INSTR(AND, instr_cat2, .opc = OPC_AND_B), 2546 INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16), 2547 INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F), 2548 INSTR(CLAMP, trans_clamp), 2549 INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F), 2550 INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F), 2551 INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F), 2552 INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F), 2553 INSTR(ARL, trans_arl), 2554 INSTR(UARL, trans_arl), 2555 INSTR(EX2, instr_cat4, .opc = OPC_EXP2), 2556 INSTR(LG2, instr_cat4, .opc = OPC_LOG2), 2557 INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F), 2558 INSTR(COS, instr_cat4, .opc = OPC_COS), 2559 
INSTR(SIN, instr_cat4, .opc = OPC_SIN), 2560 INSTR(TEX, trans_samp, .opc = OPC_SAM), 2561 INSTR(TXP, trans_samp, .opc = OPC_SAM), 2562 INSTR(TXB, trans_samp, .opc = OPC_SAMB), 2563 INSTR(TXB2, trans_samp, .opc = OPC_SAMB), 2564 INSTR(TXL, trans_samp, .opc = OPC_SAML), 2565 INSTR(TXD, trans_samp, .opc = OPC_SAMGQ), 2566 INSTR(TXF, trans_samp, .opc = OPC_ISAML), 2567 INSTR(TXQ, trans_txq), 2568 INSTR(DDX, trans_deriv, .opc = OPC_DSX), 2569 INSTR(DDY, trans_deriv, .opc = OPC_DSY), 2570 INSTR(SGT, trans_cmp), 2571 INSTR(SLT, trans_cmp), 2572 INSTR(FSLT, trans_cmp), 2573 INSTR(SGE, trans_cmp), 2574 INSTR(FSGE, trans_cmp), 2575 INSTR(SLE, trans_cmp), 2576 INSTR(SNE, trans_cmp), 2577 INSTR(FSNE, trans_cmp), 2578 INSTR(SEQ, trans_cmp), 2579 INSTR(FSEQ, trans_cmp), 2580 INSTR(CMP, trans_cmp), 2581 INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U), 2582 INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U), 2583 INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S), 2584 INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U), 2585 INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S), 2586 INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U), 2587 INSTR(UCMP, trans_ucmp), 2588 INSTR(ISSG, trans_issg), 2589 INSTR(IF, trans_if, .opc = OPC_CMPS_F), 2590 INSTR(UIF, trans_if, .opc = OPC_CMPS_U), 2591 INSTR(ELSE, trans_else), 2592 INSTR(ENDIF, trans_endif), 2593 INSTR(END, instr_cat0, .opc = OPC_END), 2594 INSTR(KILL, trans_kill, .opc = OPC_KILL), 2595 INSTR(KILL_IF, trans_killif, .opc = OPC_KILL), 2596 INSTR(I2F, trans_cov), 2597 INSTR(U2F, trans_cov), 2598 INSTR(F2I, trans_cov), 2599 INSTR(F2U, trans_cov), 2600}; 2601 2602static ir3_semantic 2603decl_semantic(const struct tgsi_declaration_semantic *sem) 2604{ 2605 return ir3_semantic_name(sem->Name, sem->Index); 2606} 2607 2608static struct ir3_instruction * 2609decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid, 2610 unsigned j, unsigned inloc) 2611{ 2612 struct ir3_instruction *instr; 2613 struct ir3_register *src; 2614 2615 /* bary.f dst, #inloc, r0.x */ 2616 instr = 
instr_create(ctx, 2, OPC_BARY_F);
	ir3_reg_create(instr, regid, 0);   /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
	/* bary.f consumes both components of the r0.xy position vec: */
	src->wrmask = 0x3;
	src->instr = ctx->frag_pos;

	return instr;
}

/* TGSI_SEMANTIC_POSITION
 * """"""""""""""""""""""
 *
 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
 * fragment shader input contains the fragment's window position.  The X
 * component starts at zero and always increases from left to right.
 * The Y component starts at zero and always increases but Y=0 may either
 * indicate the top of the window or the bottom depending on the fragment
 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
 * The Z coordinate ranges from 0 to 1 to represent depth from the front
 * to the back of the Z buffer.  The W component contains the reciprocal
 * of the interpolated vertex position W component.
 */
static struct ir3_instruction *
decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	compile_assert(ctx, !ctx->frag_coord[j]);

	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);


	switch (j) {
	case 0: /* .x */
	case 1: /* .y */
		/* for frag_coord, we get unsigned values.. we need
		 * to subtract (integer) 8 and divide by 16 (right-
		 * shift by 4) then convert to float:
		 */

		/* add.s tmp, src, -8 */
		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
		src = instr;

		/* shr.b tmp, tmp, 4 */
		instr = instr_create(ctx, 2, OPC_SHR_B);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
		src = instr;

		/* mov.u32f32 dst, tmp */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = TYPE_U32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 2: /* .z */
	case 3: /* .w */
		/* seems that we can use these as-is: */
		instr = ctx->frag_coord[j];
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}

/* TGSI_SEMANTIC_FACE
 * """"""""""""""""""
 *
 * This label applies to fragment shader inputs only and indicates that
 * the register contains front/back-face information of the form (F, 0,
 * 0, 1).  The first component will be positive when the fragment belongs
 * to a front-facing polygon, and negative when the fragment belongs to a
 * back-facing polygon.
 */
static struct ir3_instruction *
decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	switch (j) {
	case 0: /* .x */
		compile_assert(ctx, !ctx->frag_face);

		ctx->frag_face = create_input(ctx->block, NULL, 0);

		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
		 * positive vs negative float.. and piglit further seems to
		 * expect -1.0 or 1.0:
		 *
		 *    mul.s tmp, hr0.x, 2
		 *    add.s tmp, tmp, 1
		 *    mov.s16f32, dst, tmp
		 *
		 */

		instr = instr_create(ctx, 2, OPC_MUL_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
		src = instr;

		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
		src = instr;

		instr = instr_create(ctx, 1, 0);    /* mov */
		instr->cat1.src_type = TYPE_S32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 1: /* .y */
	case 2: /* .z */
		instr = create_immed(ctx, 0.0);
		break;
	case 3: /* .w */
		instr = create_immed(ctx, 1.0);
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}

/* Handle a TGSI input declaration: record it in the shader variant and
 * create the per-component input instructions (bary.f fetches for normal
 * FS varyings, special handling for POSITION/FACE, plain inputs for VS):
 */
static void
decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	/* I don't think we should get frag shader input without
	 * semantic info?  Otherwise how do inputs get linked to
	 * vert outputs?
	 */
	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
			decl->Declaration.Semantic);

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->inputs_count++;
		unsigned r = regid(i, 0);
		unsigned ncomp, j;

		/* we'll figure out the actual components used after scheduling */
		ncomp = 4;

		DBG("decl in -> r%d", i);

		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));

		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
		so->inputs[n].compmask = (1 << ncomp) - 1;
		so->inputs[n].regid = r;
		so->inputs[n].inloc = ctx->next_inloc;
		so->inputs[n].interpolate = decl->Interp.Interpolate;

		for (j = 0; j < ncomp; j++) {
			struct ir3_instruction *instr = NULL;

			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				/* for fragment shaders, POSITION and FACE are handled
				 * specially, not using normal varying / bary.f
				 */
				if (name == TGSI_SEMANTIC_POSITION) {
					so->inputs[n].bary = false;
					so->frag_coord = true;
					instr = decl_in_frag_coord(ctx, r + j, j);
				} else if (name == TGSI_SEMANTIC_FACE) {
					so->inputs[n].bary = false;
					so->frag_face = true;
					instr = decl_in_frag_face(ctx, r + j, j);
				} else {
					so->inputs[n].bary = true;
					instr = decl_in_frag_bary(ctx, r + j, j,
							so->inputs[n].inloc + j - 8);
				}
			} else {
				instr = create_input(ctx->block, NULL, (i * 4) + j);
			}

			ctx->block->inputs[(i * 4) + j] = instr;
		}

		/* only varying-fetched FS inputs (and all VS inputs) consume
		 * inloc slots / count towards total_in:
		 */
		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
			ctx->next_inloc += ncomp;
			so->total_in += ncomp;
		}
	}
}

/* Handle a TGSI output declaration: validate the semantic for the shader
 * stage, record it in the variant, and pre-seed block->outputs[]:
 */
static void
decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned comp = 0;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	compile_assert(ctx, decl->Declaration.Semantic);

	DBG("decl out[%d] -> r%d", name, decl->Range.First);

	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_PSIZE:
			so->writes_psize = true;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_GENERIC:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
			break;
		default:
			compile_error(ctx, "unknown VS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	} else {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			comp = 2;  /* tgsi will write to .z component */
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_COLOR:
			break;
		default:
			compile_error(ctx, "unknown FS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	}

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->outputs_count++;
		unsigned ncomp, j;

		ncomp = 4;

		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
		so->outputs[n].regid = regid(i, comp);

		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
		 * which if the output is actually assigned will be over-
		 * written
		 */
		for (j = 0; j < ncomp; j++)
			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
	}
}

/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
 * for a fragment shader are just bary.f instructions.  The *actual* inputs
 * from the hw perspective are the frag_pos and optionally frag_coord and
 * frag_face.
 */
static void
fixup_frag_inputs(struct ir3_compile_context *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3_block *block = ctx->block;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;
	int n, regid = 0;

	/* rebuild the input array from scratch with only the real hw
	 * inputs (frag_face / frag_coord / frag_pos):
	 */
	block->ninputs = 0;

	n  = 4;  /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[block->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
	}

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it.  But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[block->ninputs++] = ctx->frag_coord[0];
		inputs[block->ninputs++] = ctx->frag_coord[1];
		inputs[block->ninputs++] = ctx->frag_coord[2];
		inputs[block->ninputs++] = ctx->frag_coord[3];
	}

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* r0.x */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	block->inputs = inputs;
}

/* Main token loop: walk the TGSI token stream, handling declarations,
 * immediates, and instructions (dispatched via translaters[]):
 */
static void
compile_instructions(struct ir3_compile_context *ctx)
{
	push_block(ctx);

	/* for fragment shader, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA);   /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA);   /* r0.y */
		ctx->frag_pos = instr;
	}

	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_DECLARATION: {
			struct tgsi_full_declaration *decl =
					&ctx->parser.FullToken.FullDeclaration;
			if (decl->Declaration.File ==
TGSI_FILE_OUTPUT) {
				decl_out(ctx, decl);
			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
				decl_in(ctx, decl);
			}
			break;
		}
		case TGSI_TOKEN_TYPE_IMMEDIATE: {
			/* TODO: if we know the immediate is small enough, and only
			 * used with instructions that can embed an immediate, we
			 * can skip this:
			 */
			struct tgsi_full_immediate *imm =
					&ctx->parser.FullToken.FullImmediate;
			unsigned n = ctx->so->immediates_count++;
			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
			/* 16 bytes == one vec4 immediate: */
			memcpy(ctx->so->immediates[n].val, imm->u, 16);
			break;
		}
		case TGSI_TOKEN_TYPE_INSTRUCTION: {
			struct tgsi_full_instruction *inst =
					&ctx->parser.FullToken.FullInstruction;
			unsigned opc = inst->Instruction.Opcode;
			const struct instr_translater *t = &translaters[opc];

			if (t->fxn) {
				t->fxn(t, ctx, inst);
				/* internal temps are only valid for the duration of
				 * a single translated instruction:
				 */
				ctx->num_internal_temps = 0;

				compile_assert(ctx, !ctx->using_tmp_dst);
			} else {
				compile_error(ctx, "unknown TGSI opc: %s\n",
						tgsi_get_opcode_name(opc));
			}

			/* apply the instruction's saturate modifier, if any: */
			switch (inst->Instruction.Saturate) {
			case TGSI_SAT_ZERO_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(0.0), fui(1.0));
				break;
			case TGSI_SAT_MINUS_PLUS_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(-1.0), fui(1.0));
				break;
			}

			instr_finish(ctx);

			break;
		}
		default:
			break;
		}
	}
}

/* Dump the current IR as a graphviz .dot file (debug aid, enabled via
 * FD_DBG_OPTDUMP):
 */
static void
compile_dump(struct ir3_compile_context *ctx)
{
	const char *name = (ctx->so->type == SHADER_VERTEX) ?
"vert" : "frag"; 3046 static unsigned n = 0; 3047 char fname[16]; 3048 FILE *f; 3049 snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); 3050 f = fopen(fname, "w"); 3051 if (!f) 3052 return; 3053 ir3_block_depth(ctx->block); 3054 ir3_dump(ctx->ir, name, ctx->block, f); 3055 fclose(f); 3056} 3057 3058int 3059ir3_compile_shader(struct ir3_shader_variant *so, 3060 const struct tgsi_token *tokens, struct ir3_shader_key key, 3061 bool cp) 3062{ 3063 struct ir3_compile_context ctx; 3064 struct ir3_block *block; 3065 struct ir3_instruction **inputs; 3066 unsigned i, j, actual_in; 3067 int ret = 0, max_bary; 3068 3069 assert(!so->ir); 3070 3071 so->ir = ir3_create(); 3072 3073 assert(so->ir); 3074 3075 if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) { 3076 DBG("INIT failed!"); 3077 ret = -1; 3078 goto out; 3079 } 3080 3081 compile_instructions(&ctx); 3082 3083 block = ctx.block; 3084 so->ir->block = block; 3085 3086 /* keep track of the inputs from TGSI perspective.. */ 3087 inputs = block->inputs; 3088 3089 /* but fixup actual inputs for frag shader: */ 3090 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) 3091 fixup_frag_inputs(&ctx); 3092 3093 /* at this point, for binning pass, throw away unneeded outputs: */ 3094 if (key.binning_pass) { 3095 for (i = 0, j = 0; i < so->outputs_count; i++) { 3096 unsigned name = sem2name(so->outputs[i].semantic); 3097 unsigned idx = sem2name(so->outputs[i].semantic); 3098 3099 /* throw away everything but first position/psize */ 3100 if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || 3101 (name == TGSI_SEMANTIC_PSIZE))) { 3102 if (i != j) { 3103 so->outputs[j] = so->outputs[i]; 3104 block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; 3105 block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; 3106 block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; 3107 block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; 3108 } 3109 j++; 3110 } 3111 } 3112 so->outputs_count = j; 3113 block->noutputs = j * 4; 3114 } 3115 3116 /* for rendering to alpha 
format, we only need the .w component, 3117 * and we need it to be in the .x position: 3118 */ 3119 if (key.alpha) { 3120 for (i = 0, j = 0; i < so->outputs_count; i++) { 3121 unsigned name = sem2name(so->outputs[i].semantic); 3122 3123 /* move .w component to .x and discard others: */ 3124 if (name == TGSI_SEMANTIC_COLOR) { 3125 block->outputs[(i*4)+0] = block->outputs[(i*4)+3]; 3126 block->outputs[(i*4)+1] = NULL; 3127 block->outputs[(i*4)+2] = NULL; 3128 block->outputs[(i*4)+3] = NULL; 3129 } 3130 } 3131 } 3132 3133 /* at this point, we want the kill's in the outputs array too, 3134 * so that they get scheduled (since they have no dst).. we've 3135 * already ensured that the array is big enough in push_block(): 3136 */ 3137 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { 3138 for (i = 0; i < ctx.kill_count; i++) 3139 block->outputs[block->noutputs++] = ctx.kill[i]; 3140 } 3141 3142 if (fd_mesa_debug & FD_DBG_OPTDUMP) 3143 compile_dump(&ctx); 3144 3145 ret = ir3_block_flatten(block); 3146 if (ret < 0) { 3147 DBG("FLATTEN failed!"); 3148 goto out; 3149 } 3150 if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP)) 3151 compile_dump(&ctx); 3152 3153 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 3154 printf("BEFORE CP:\n"); 3155 ir3_dump_instr_list(block->head); 3156 } 3157 3158 if (cp && !(fd_mesa_debug & FD_DBG_NOCP)) 3159 ir3_block_cp(block); 3160 3161 if (fd_mesa_debug & FD_DBG_OPTDUMP) 3162 compile_dump(&ctx); 3163 3164 ir3_block_depth(block); 3165 3166 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 3167 printf("AFTER DEPTH:\n"); 3168 ir3_dump_instr_list(block->head); 3169 } 3170 3171 ret = ir3_block_sched(block); 3172 if (ret) { 3173 DBG("SCHED failed!"); 3174 goto out; 3175 } 3176 3177 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 3178 printf("AFTER SCHED:\n"); 3179 ir3_dump_instr_list(block->head); 3180 } 3181 3182 ret = ir3_block_ra(block, so->type, key.half_precision, 3183 so->frag_coord, so->frag_face, &so->has_samp, &max_bary); 3184 if (ret) { 3185 DBG("RA failed!"); 3186 goto out; 3187 } 
3188 3189 if (fd_mesa_debug & FD_DBG_OPTMSGS) { 3190 printf("AFTER RA:\n"); 3191 ir3_dump_instr_list(block->head); 3192 } 3193 3194 /* fixup input/outputs: */ 3195 for (i = 0; i < so->outputs_count; i++) { 3196 so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; 3197 /* preserve hack for depth output.. tgsi writes depth to .z, 3198 * but what we give the hw is the scalar register: 3199 */ 3200 if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) && 3201 (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION)) 3202 so->outputs[i].regid += 2; 3203 } 3204 /* Note that some or all channels of an input may be unused: */ 3205 actual_in = 0; 3206 for (i = 0; i < so->inputs_count; i++) { 3207 unsigned j, regid = ~0, compmask = 0; 3208 so->inputs[i].ncomp = 0; 3209 for (j = 0; j < 4; j++) { 3210 struct ir3_instruction *in = inputs[(i*4) + j]; 3211 if (in) { 3212 compmask |= (1 << j); 3213 regid = in->regs[0]->num - j; 3214 actual_in++; 3215 so->inputs[i].ncomp++; 3216 } 3217 } 3218 so->inputs[i].regid = regid; 3219 so->inputs[i].compmask = compmask; 3220 } 3221 3222 /* fragment shader always gets full vec4's even if it doesn't 3223 * fetch all components, but vertex shader we need to update 3224 * with the actual number of components fetch, otherwise thing 3225 * will hang due to mismaptch between VFD_DECODE's and 3226 * TOTALATTRTOVS 3227 */ 3228 if (so->type == SHADER_VERTEX) 3229 so->total_in = actual_in; 3230 else 3231 so->total_in = align(max_bary + 1, 4); 3232 3233out: 3234 if (ret) { 3235 ir3_destroy(so->ir); 3236 so->ir = NULL; 3237 } 3238 compile_free(&ctx); 3239 3240 return ret; 3241} 3242