brw_vec4.cpp revision 830b075e86e3e9af1bf12316d0f9d888a85a973b
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_vs.h"
#include "brw_nir.h"
#include "brw_vec4_live_variables.h"
#include "brw_dead_control_flow.h"
#include "program/prog_parameter.h"

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
{
   init();

   this->file = file;
   this->nr = nr;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   else
      this->swizzle = BRW_SWIZZLE_XYZW;
   if (type)
      this->type = brw_type_for_base_type(type);
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(struct ::brw_reg reg) :
   backend_reg(reg)
{
   this->reg_offset = 0;
   this->reladdr = NULL;
}

src_reg::src_reg(const dst_reg &reg) :
   backend_reg(reg)
{
   this->reladdr = reg.reladdr;
   this->swizzle = brw_swizzle_for_mask(reg.writemask);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(enum brw_reg_file file, int nr)
{
   init();

   this->file = file;
   this->nr = nr;
}

dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->nr = nr;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->nr = nr;
   this->type = type;
   this->writemask = writemask;
}

dst_reg::dst_reg(struct ::brw_reg reg) :
   backend_reg(reg)
{
   this->reg_offset = 0;
   this->reladdr = NULL;
}

dst_reg::dst_reg(const src_reg &reg) :
   backend_reg(reg)
{
   this->writemask = brw_mask_for_swizzle(reg.swizzle);
   this->reladdr = reg.reladdr;
}

bool
dst_reg::equals(const dst_reg &r) const
{
   return (this->backend_reg::equals(r) &&
           (reladdr == r.reladdr ||
            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
}

bool
vec4_instruction::is_send_from_grf()
{
   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case VEC4_OPCODE_URB_READ:
   case TCS_OPCODE_URB_WRITE:
   case TCS_OPCODE_RELEASE_INPUT:
   case SHADER_OPCODE_BARRIER:
      return true;
   default:
      return false;
   }
}

/**
 * Returns true if this instruction's sources and destinations cannot
 * safely be the same register.
 *
 * In most cases, a register can be written over safely by the same
 * instruction that is its last use.  For a single instruction, the
 * sources are dereferenced before writing of the destination starts
 * (naturally).
 *
 * However, there are a few cases where this can be problematic:
 *
 * - Virtual opcodes that translate to multiple instructions in the
 *   code generator: if src == dst and one instruction writes the
 *   destination before a later instruction reads the source, then
 *   src will have been clobbered.
 *
 * The register allocator uses this information to set up conflicts between
 * GRF sources and the destination.
 */
bool
vec4_instruction::has_source_and_destination_hazard() const
{
   switch (opcode) {
   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
      return true;
   default:
      return false;
   }
}

unsigned
vec4_instruction::regs_read(unsigned arg) const
{
   if (src[arg].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case TCS_OPCODE_URB_WRITE:
      return arg == 0 ? mlen : 1;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1 ? mlen : 1;

   default:
      return 1;
   }
}

bool
vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
vec4_instruction::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate));
}
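
/* For illustration, consider a raw MOV with no source modifiers:
 *
 *    mov vgrf1.xyzw:D, vgrf2.xyzw:D
 *
 * Such a MOV copies bits verbatim, so later passes may retype it (say, to
 * :F) without changing the result.  The modifier and saturate checks above
 * exclude the cases where negate, abs or saturate would reinterpret the bit
 * pattern under the new type.
 */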
258 */ 259int 260vec4_visitor::implied_mrf_writes(vec4_instruction *inst) 261{ 262 if (inst->mlen == 0 || inst->is_send_from_grf()) 263 return 0; 264 265 switch (inst->opcode) { 266 case SHADER_OPCODE_RCP: 267 case SHADER_OPCODE_RSQ: 268 case SHADER_OPCODE_SQRT: 269 case SHADER_OPCODE_EXP2: 270 case SHADER_OPCODE_LOG2: 271 case SHADER_OPCODE_SIN: 272 case SHADER_OPCODE_COS: 273 return 1; 274 case SHADER_OPCODE_INT_QUOTIENT: 275 case SHADER_OPCODE_INT_REMAINDER: 276 case SHADER_OPCODE_POW: 277 case TCS_OPCODE_THREAD_END: 278 return 2; 279 case VS_OPCODE_URB_WRITE: 280 return 1; 281 case VS_OPCODE_PULL_CONSTANT_LOAD: 282 return 2; 283 case SHADER_OPCODE_GEN4_SCRATCH_READ: 284 return 2; 285 case SHADER_OPCODE_GEN4_SCRATCH_WRITE: 286 return 3; 287 case GS_OPCODE_URB_WRITE: 288 case GS_OPCODE_URB_WRITE_ALLOCATE: 289 case GS_OPCODE_THREAD_END: 290 return 0; 291 case GS_OPCODE_FF_SYNC: 292 return 1; 293 case TCS_OPCODE_URB_WRITE: 294 return 0; 295 case SHADER_OPCODE_SHADER_TIME_ADD: 296 return 0; 297 case SHADER_OPCODE_TEX: 298 case SHADER_OPCODE_TXL: 299 case SHADER_OPCODE_TXD: 300 case SHADER_OPCODE_TXF: 301 case SHADER_OPCODE_TXF_CMS: 302 case SHADER_OPCODE_TXF_CMS_W: 303 case SHADER_OPCODE_TXF_MCS: 304 case SHADER_OPCODE_TXS: 305 case SHADER_OPCODE_TG4: 306 case SHADER_OPCODE_TG4_OFFSET: 307 case SHADER_OPCODE_SAMPLEINFO: 308 case VS_OPCODE_GET_BUFFER_SIZE: 309 return inst->header_size; 310 default: 311 unreachable("not reached"); 312 } 313} 314 315bool 316src_reg::equals(const src_reg &r) const 317{ 318 return (this->backend_reg::equals(r) && 319 !reladdr && !r.reladdr); 320} 321 322bool 323vec4_visitor::opt_vector_float() 324{ 325 bool progress = false; 326 327 int last_reg = -1, last_reg_offset = -1; 328 enum brw_reg_file last_reg_file = BAD_FILE; 329 330 int remaining_channels = 0; 331 uint8_t imm[4]; 332 int inst_count = 0; 333 vec4_instruction *imm_inst[4]; 334 335 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { 336 if (last_reg != inst->dst.nr || 337 last_reg_offset != inst->dst.reg_offset || 338 last_reg_file != inst->dst.file) { 339 last_reg = inst->dst.nr; 340 last_reg_offset = inst->dst.reg_offset; 341 last_reg_file = inst->dst.file; 342 remaining_channels = WRITEMASK_XYZW; 343 344 inst_count = 0; 345 } 346 347 if (inst->opcode != BRW_OPCODE_MOV || 348 inst->dst.writemask == WRITEMASK_XYZW || 349 inst->src[0].file != IMM) 350 continue; 351 352 int vf = brw_float_to_vf(inst->src[0].f); 353 if (vf == -1) 354 continue; 355 356 if ((inst->dst.writemask & WRITEMASK_X) != 0) 357 imm[0] = vf; 358 if ((inst->dst.writemask & WRITEMASK_Y) != 0) 359 imm[1] = vf; 360 if ((inst->dst.writemask & WRITEMASK_Z) != 0) 361 imm[2] = vf; 362 if ((inst->dst.writemask & WRITEMASK_W) != 0) 363 imm[3] = vf; 364 365 imm_inst[inst_count++] = inst; 366 367 remaining_channels &= ~inst->dst.writemask; 368 if (remaining_channels == 0) { 369 unsigned vf; 370 memcpy(&vf, imm, sizeof(vf)); 371 vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf)); 372 mov->dst.type = BRW_REGISTER_TYPE_F; 373 mov->dst.writemask = WRITEMASK_XYZW; 374 inst->insert_after(block, mov); 375 last_reg = -1; 376 377 for (int i = 0; i < inst_count; i++) { 378 imm_inst[i]->remove(block); 379 } 380 progress = true; 381 } 382 } 383 384 if (progress) 385 invalidate_live_intervals(); 386 387 return progress; 388} 389 390/* Replaces unused channels of a swizzle with channels that are used. 
391 * 392 * For instance, this pass transforms 393 * 394 * mov vgrf4.yz, vgrf5.wxzy 395 * 396 * into 397 * 398 * mov vgrf4.yz, vgrf5.xxzx 399 * 400 * This eliminates false uses of some channels, letting dead code elimination 401 * remove the instructions that wrote them. 402 */ 403bool 404vec4_visitor::opt_reduce_swizzle() 405{ 406 bool progress = false; 407 408 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { 409 if (inst->dst.file == BAD_FILE || 410 inst->dst.file == ARF || 411 inst->dst.file == FIXED_GRF || 412 inst->is_send_from_grf()) 413 continue; 414 415 unsigned swizzle; 416 417 /* Determine which channels of the sources are read. */ 418 switch (inst->opcode) { 419 case VEC4_OPCODE_PACK_BYTES: 420 case BRW_OPCODE_DP4: 421 case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0, 422 * but all four of src1. 423 */ 424 swizzle = brw_swizzle_for_size(4); 425 break; 426 case BRW_OPCODE_DP3: 427 swizzle = brw_swizzle_for_size(3); 428 break; 429 case BRW_OPCODE_DP2: 430 swizzle = brw_swizzle_for_size(2); 431 break; 432 default: 433 swizzle = brw_swizzle_for_mask(inst->dst.writemask); 434 break; 435 } 436 437 /* Update sources' swizzles. */ 438 for (int i = 0; i < 3; i++) { 439 if (inst->src[i].file != VGRF && 440 inst->src[i].file != ATTR && 441 inst->src[i].file != UNIFORM) 442 continue; 443 444 const unsigned new_swizzle = 445 brw_compose_swizzle(swizzle, inst->src[i].swizzle); 446 if (inst->src[i].swizzle != new_swizzle) { 447 inst->src[i].swizzle = new_swizzle; 448 progress = true; 449 } 450 } 451 } 452 453 if (progress) 454 invalidate_live_intervals(); 455 456 return progress; 457} 458 459void 460vec4_visitor::split_uniform_registers() 461{ 462 /* Prior to this, uniforms have been in an array sized according to 463 * the number of vector uniforms present, sparsely filled (so an 464 * aggregate results in reg indices being skipped over). Now we're 465 * going to cut those aggregates up so each .nr index is one 466 * vector. The goal is to make elimination of unused uniform 467 * components easier later. 468 */ 469 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 470 for (int i = 0 ; i < 3; i++) { 471 if (inst->src[i].file != UNIFORM) 472 continue; 473 474 assert(!inst->src[i].reladdr); 475 476 inst->src[i].nr += inst->src[i].reg_offset; 477 inst->src[i].reg_offset = 0; 478 } 479 } 480 481 /* Update that everything is now vector-sized. */ 482 for (int i = 0; i < this->uniforms; i++) { 483 this->uniform_size[i] = 1; 484 } 485} 486 487void 488vec4_visitor::pack_uniform_registers() 489{ 490 uint8_t chans_used[this->uniforms]; 491 int new_loc[this->uniforms]; 492 int new_chan[this->uniforms]; 493 494 memset(chans_used, 0, sizeof(chans_used)); 495 memset(new_loc, 0, sizeof(new_loc)); 496 memset(new_chan, 0, sizeof(new_chan)); 497 498 /* Find which uniform vectors are actually used by the program. We 499 * expect unused vector elements when we've moved array access out 500 * to pull constants, and from some GLSL code generators like wine. 
501 */ 502 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 503 unsigned readmask; 504 switch (inst->opcode) { 505 case VEC4_OPCODE_PACK_BYTES: 506 case BRW_OPCODE_DP4: 507 case BRW_OPCODE_DPH: 508 readmask = 0xf; 509 break; 510 case BRW_OPCODE_DP3: 511 readmask = 0x7; 512 break; 513 case BRW_OPCODE_DP2: 514 readmask = 0x3; 515 break; 516 default: 517 readmask = inst->dst.writemask; 518 break; 519 } 520 521 for (int i = 0 ; i < 3; i++) { 522 if (inst->src[i].file != UNIFORM) 523 continue; 524 525 int reg = inst->src[i].nr; 526 for (int c = 0; c < 4; c++) { 527 if (!(readmask & (1 << c))) 528 continue; 529 530 chans_used[reg] = MAX2(chans_used[reg], 531 BRW_GET_SWZ(inst->src[i].swizzle, c) + 1); 532 } 533 } 534 } 535 536 int new_uniform_count = 0; 537 538 /* Now, figure out a packing of the live uniform vectors into our 539 * push constants. 540 */ 541 for (int src = 0; src < uniforms; src++) { 542 assert(src < uniform_array_size); 543 int size = chans_used[src]; 544 545 if (size == 0) 546 continue; 547 548 int dst; 549 /* Find the lowest place we can slot this uniform in. */ 550 for (dst = 0; dst < src; dst++) { 551 if (chans_used[dst] + size <= 4) 552 break; 553 } 554 555 if (src == dst) { 556 new_loc[src] = dst; 557 new_chan[src] = 0; 558 } else { 559 new_loc[src] = dst; 560 new_chan[src] = chans_used[dst]; 561 562 /* Move the references to the data */ 563 for (int j = 0; j < size; j++) { 564 stage_prog_data->param[dst * 4 + new_chan[src] + j] = 565 stage_prog_data->param[src * 4 + j]; 566 } 567 568 chans_used[dst] += size; 569 chans_used[src] = 0; 570 } 571 572 new_uniform_count = MAX2(new_uniform_count, dst + 1); 573 } 574 575 this->uniforms = new_uniform_count; 576 577 /* Now, update the instructions for our repacked uniforms. */ 578 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 579 for (int i = 0 ; i < 3; i++) { 580 int src = inst->src[i].nr; 581 582 if (inst->src[i].file != UNIFORM) 583 continue; 584 585 inst->src[i].nr = new_loc[src]; 586 inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src], 587 new_chan[src], new_chan[src]); 588 } 589 } 590} 591 592/** 593 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a). 594 * 595 * While GLSL IR also performs this optimization, we end up with it in 596 * our instruction stream for a couple of reasons. One is that we 597 * sometimes generate silly instructions, for example in array access 598 * where we'll generate "ADD offset, index, base" even if base is 0. 599 * The other is that GLSL IR's constant propagation doesn't track the 600 * components of aggregates, so some VS patterns (initialize matrix to 601 * 0, accumulate in vertex blending factors) end up breaking down to 602 * instructions involving 0. 
603 */ 604bool 605vec4_visitor::opt_algebraic() 606{ 607 bool progress = false; 608 609 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 610 switch (inst->opcode) { 611 case BRW_OPCODE_MOV: 612 if (inst->src[0].file != IMM) 613 break; 614 615 if (inst->saturate) { 616 if (inst->dst.type != inst->src[0].type) 617 assert(!"unimplemented: saturate mixed types"); 618 619 if (brw_saturate_immediate(inst->dst.type, 620 &inst->src[0].as_brw_reg())) { 621 inst->saturate = false; 622 progress = true; 623 } 624 } 625 break; 626 627 case VEC4_OPCODE_UNPACK_UNIFORM: 628 if (inst->src[0].file != UNIFORM) { 629 inst->opcode = BRW_OPCODE_MOV; 630 progress = true; 631 } 632 break; 633 634 case BRW_OPCODE_ADD: 635 if (inst->src[1].is_zero()) { 636 inst->opcode = BRW_OPCODE_MOV; 637 inst->src[1] = src_reg(); 638 progress = true; 639 } 640 break; 641 642 case BRW_OPCODE_MUL: 643 if (inst->src[1].is_zero()) { 644 inst->opcode = BRW_OPCODE_MOV; 645 switch (inst->src[0].type) { 646 case BRW_REGISTER_TYPE_F: 647 inst->src[0] = brw_imm_f(0.0f); 648 break; 649 case BRW_REGISTER_TYPE_D: 650 inst->src[0] = brw_imm_d(0); 651 break; 652 case BRW_REGISTER_TYPE_UD: 653 inst->src[0] = brw_imm_ud(0u); 654 break; 655 default: 656 unreachable("not reached"); 657 } 658 inst->src[1] = src_reg(); 659 progress = true; 660 } else if (inst->src[1].is_one()) { 661 inst->opcode = BRW_OPCODE_MOV; 662 inst->src[1] = src_reg(); 663 progress = true; 664 } else if (inst->src[1].is_negative_one()) { 665 inst->opcode = BRW_OPCODE_MOV; 666 inst->src[0].negate = !inst->src[0].negate; 667 inst->src[1] = src_reg(); 668 progress = true; 669 } 670 break; 671 case BRW_OPCODE_CMP: 672 if (inst->conditional_mod == BRW_CONDITIONAL_GE && 673 inst->src[0].abs && 674 inst->src[0].negate && 675 inst->src[1].is_zero()) { 676 inst->src[0].abs = false; 677 inst->src[0].negate = false; 678 inst->conditional_mod = BRW_CONDITIONAL_Z; 679 progress = true; 680 break; 681 } 682 break; 683 case SHADER_OPCODE_RCP: { 684 vec4_instruction *prev = (vec4_instruction *)inst->prev; 685 if (prev->opcode == SHADER_OPCODE_SQRT) { 686 if (inst->src[0].equals(src_reg(prev->dst))) { 687 inst->opcode = SHADER_OPCODE_RSQ; 688 inst->src[0] = prev->src[0]; 689 progress = true; 690 } 691 } 692 break; 693 } 694 case SHADER_OPCODE_BROADCAST: 695 if (is_uniform(inst->src[0]) || 696 inst->src[1].is_zero()) { 697 inst->opcode = BRW_OPCODE_MOV; 698 inst->src[1] = src_reg(); 699 inst->force_writemask_all = true; 700 progress = true; 701 } 702 break; 703 704 default: 705 break; 706 } 707 } 708 709 if (progress) 710 invalidate_live_intervals(); 711 712 return progress; 713} 714 715/** 716 * Only a limited number of hardware registers may be used for push 717 * constants, so this turns access to the overflowed constants into 718 * pull constants. 719 */ 720void 721vec4_visitor::move_push_constants_to_pull_constants() 722{ 723 int pull_constant_loc[this->uniforms]; 724 725 /* Only allow 32 registers (256 uniform components) as push constants, 726 * which is the limit on gen6. 727 * 728 * If changing this value, note the limitation about total_regs in 729 * brw_curbe.c. 730 */ 731 int max_uniform_components = 32 * 8; 732 if (this->uniforms * 4 <= max_uniform_components) 733 return; 734 735 /* Make some sort of choice as to which uniforms get sent to pull 736 * constants. We could potentially do something clever here like 737 * look for the most infrequently used uniform vec4s, but leave 738 * that for later. 
739 */ 740 for (int i = 0; i < this->uniforms * 4; i += 4) { 741 pull_constant_loc[i / 4] = -1; 742 743 if (i >= max_uniform_components) { 744 const gl_constant_value **values = &stage_prog_data->param[i]; 745 746 /* Try to find an existing copy of this uniform in the pull 747 * constants if it was part of an array access already. 748 */ 749 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) { 750 int matches; 751 752 for (matches = 0; matches < 4; matches++) { 753 if (stage_prog_data->pull_param[j + matches] != values[matches]) 754 break; 755 } 756 757 if (matches == 4) { 758 pull_constant_loc[i / 4] = j / 4; 759 break; 760 } 761 } 762 763 if (pull_constant_loc[i / 4] == -1) { 764 assert(stage_prog_data->nr_pull_params % 4 == 0); 765 pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4; 766 767 for (int j = 0; j < 4; j++) { 768 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] = 769 values[j]; 770 } 771 } 772 } 773 } 774 775 /* Now actually rewrite usage of the things we've moved to pull 776 * constants. 777 */ 778 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { 779 for (int i = 0 ; i < 3; i++) { 780 if (inst->src[i].file != UNIFORM || 781 pull_constant_loc[inst->src[i].nr] == -1) 782 continue; 783 784 int uniform = inst->src[i].nr; 785 786 dst_reg temp = dst_reg(this, glsl_type::vec4_type); 787 788 emit_pull_constant_load(block, inst, temp, inst->src[i], 789 pull_constant_loc[uniform]); 790 791 inst->src[i].file = temp.file; 792 inst->src[i].nr = temp.nr; 793 inst->src[i].reg_offset = temp.reg_offset; 794 inst->src[i].reladdr = NULL; 795 } 796 } 797 798 /* Repack push constants to remove the now-unused ones. */ 799 pack_uniform_registers(); 800} 801 802/* Conditions for which we want to avoid setting the dependency control bits */ 803bool 804vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst) 805{ 806#define IS_DWORD(reg) \ 807 (reg.type == BRW_REGISTER_TYPE_UD || \ 808 reg.type == BRW_REGISTER_TYPE_D) 809 810 /* "When source or destination datatype is 64b or operation is integer DWord 811 * multiply, DepCtrl must not be used." 812 * May apply to future SoCs as well. 813 */ 814 if (devinfo->is_cherryview) { 815 if (inst->opcode == BRW_OPCODE_MUL && 816 IS_DWORD(inst->src[0]) && 817 IS_DWORD(inst->src[1])) 818 return true; 819 } 820#undef IS_DWORD 821 822 if (devinfo->gen >= 8) { 823 if (inst->opcode == BRW_OPCODE_F32TO16) 824 return true; 825 } 826 827 /* 828 * mlen: 829 * In the presence of send messages, totally interrupt dependency 830 * control. They're long enough that the chance of dependency 831 * control around them just doesn't matter. 832 * 833 * predicate: 834 * From the Ivy Bridge PRM, volume 4 part 3.7, page 80: 835 * When a sequence of NoDDChk and NoDDClr are used, the last instruction that 836 * completes the scoreboard clear must have a non-zero execution mask. This 837 * means, if any kind of predication can change the execution mask or channel 838 * enable of the last instruction, the optimization must be avoided. This is 839 * to avoid instructions being shot down the pipeline when no writes are 840 * required. 841 * 842 * math: 843 * Dependency control does not work well over math instructions. 844 * NB: Discovered empirically 845 */ 846 return (inst->mlen || inst->predicate || inst->is_math()); 847} 848 849/** 850 * Sets the dependency control fields on instructions after register 851 * allocation and before the generator is run. 
852 * 853 * When you have a sequence of instructions like: 854 * 855 * DP4 temp.x vertex uniform[0] 856 * DP4 temp.y vertex uniform[0] 857 * DP4 temp.z vertex uniform[0] 858 * DP4 temp.w vertex uniform[0] 859 * 860 * The hardware doesn't know that it can actually run the later instructions 861 * while the previous ones are in flight, producing stalls. However, we have 862 * manual fields we can set in the instructions that let it do so. 863 */ 864void 865vec4_visitor::opt_set_dependency_control() 866{ 867 vec4_instruction *last_grf_write[BRW_MAX_GRF]; 868 uint8_t grf_channels_written[BRW_MAX_GRF]; 869 vec4_instruction *last_mrf_write[BRW_MAX_GRF]; 870 uint8_t mrf_channels_written[BRW_MAX_GRF]; 871 872 assert(prog_data->total_grf || 873 !"Must be called after register allocation"); 874 875 foreach_block (block, cfg) { 876 memset(last_grf_write, 0, sizeof(last_grf_write)); 877 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 878 879 foreach_inst_in_block (vec4_instruction, inst, block) { 880 /* If we read from a register that we were doing dependency control 881 * on, don't do dependency control across the read. 882 */ 883 for (int i = 0; i < 3; i++) { 884 int reg = inst->src[i].nr + inst->src[i].reg_offset; 885 if (inst->src[i].file == VGRF) { 886 last_grf_write[reg] = NULL; 887 } else if (inst->src[i].file == FIXED_GRF) { 888 memset(last_grf_write, 0, sizeof(last_grf_write)); 889 break; 890 } 891 assert(inst->src[i].file != MRF); 892 } 893 894 if (is_dep_ctrl_unsafe(inst)) { 895 memset(last_grf_write, 0, sizeof(last_grf_write)); 896 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 897 continue; 898 } 899 900 /* Now, see if we can do dependency control for this instruction 901 * against a previous one writing to its destination. 902 */ 903 int reg = inst->dst.nr + inst->dst.reg_offset; 904 if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { 905 if (last_grf_write[reg] && 906 !(inst->dst.writemask & grf_channels_written[reg])) { 907 last_grf_write[reg]->no_dd_clear = true; 908 inst->no_dd_check = true; 909 } else { 910 grf_channels_written[reg] = 0; 911 } 912 913 last_grf_write[reg] = inst; 914 grf_channels_written[reg] |= inst->dst.writemask; 915 } else if (inst->dst.file == MRF) { 916 if (last_mrf_write[reg] && 917 !(inst->dst.writemask & mrf_channels_written[reg])) { 918 last_mrf_write[reg]->no_dd_clear = true; 919 inst->no_dd_check = true; 920 } else { 921 mrf_channels_written[reg] = 0; 922 } 923 924 last_mrf_write[reg] = inst; 925 mrf_channels_written[reg] |= inst->dst.writemask; 926 } 927 } 928 } 929} 930 931bool 932vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo, 933 int dst_writemask, 934 int swizzle, 935 int swizzle_mask) 936{ 937 /* Gen6 MATH instructions can not execute in align16 mode, so swizzles 938 * or writemasking are not allowed. 939 */ 940 if (devinfo->gen == 6 && is_math() && 941 (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW)) 942 return false; 943 944 /* If this instruction sets anything not referenced by swizzle, then we'd 945 * totally break it when we reswizzle. 946 */ 947 if (dst.writemask & ~swizzle_mask) 948 return false; 949 950 if (mlen > 0) 951 return false; 952 953 for (int i = 0; i < 3; i++) { 954 if (src[i].is_accumulator()) 955 return false; 956 } 957 958 return true; 959} 960 961/** 962 * For any channels in the swizzle's source that were populated by this 963 * instruction, rewrite the instruction to put the appropriate result directly 964 * in those channels. 965 * 966 * e.g. 

bool
vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
                                int dst_writemask,
                                int swizzle,
                                int swizzle_mask)
{
   /* Gen6 MATH instructions can not execute in align16 mode, so swizzles
    * or writemasking are not allowed.
    */
   if (devinfo->gen == 6 && is_math() &&
       (swizzle != BRW_SWIZZLE_XYZW || dst_writemask != WRITEMASK_XYZW))
      return false;

   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   if (mlen > 0)
      return false;

   for (int i = 0; i < 3; i++) {
      if (src[i].is_accumulator())
         return false;
   }

   return true;
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy_x c.yy_x
 */
void
vec4_instruction::reswizzle(int dst_writemask, int swizzle)
{
   /* Destination write mask doesn't correspond to source swizzle for the dot
    * product and pack_bytes instructions.
    */
   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
       opcode != VEC4_OPCODE_PACK_BYTES) {
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BAD_FILE || src[i].file == IMM)
            continue;

         src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
      }
   }

   /* Apply the specified swizzle and writemask to the original mask of
    * written components.
    */
   dst.writemask = dst_writemask &
                   brw_apply_swizzle_to_mask(swizzle, dst.writemask);
}
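
/* For illustration, opt_register_coalesce() below turns
 *
 *    mul vgrf6.xyzw:F, vgrf2.xyzw:F, vgrf3.xyzw:F
 *    mov vgrf7.xyzw:F, vgrf6.xyzw:F
 *
 * into
 *
 *    mul vgrf7.xyzw:F, vgrf2.xyzw:F, vgrf3.xyzw:F
 *
 * when vgrf6 is not read again, by walking backwards from the MOV to the
 * instructions that produced its source and rewriting their destinations.
 */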

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != VGRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != VGRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      /* Remove no-op MOVs */
      if (inst->dst.file == inst->src[0].file &&
          inst->dst.nr == inst->src[0].nr &&
          inst->dst.reg_offset == inst->src[0].reg_offset) {
         bool is_nop_mov = true;

         for (unsigned c = 0; c < 4; c++) {
            if ((inst->dst.writemask & (1 << c)) == 0)
               continue;

            if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
               is_nop_mov = false;
               break;
            }
         }

         if (is_nop_mov) {
            inst->remove(block);
            continue;
         }
      }

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * VGRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* This doesn't handle saturation on the instruction we
             * want to coalesce away if the register types do not match.
             * But if scan_inst is a non type-converting 'mov', we can fix
             * the types later.
             */
            if (inst->saturate &&
                inst->dst.type != scan_inst->dst.type &&
                !(scan_inst->opcode == BRW_OPCODE_MOV &&
                  scan_inst->dst.type == scan_inst->src[0].type))
               break;

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes the same channels of our destination here,
          * we can't coalesce before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written) &&
             (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.nr >= scan_inst->base_mrf &&
                inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == VGRF &&
                scan_inst->dst.nr == inst->src[0].nr &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.nr = inst->dst.nr;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               if (inst->saturate &&
                   inst->dst.type != scan_inst->dst.type) {
                  /* If we have reached this point, scan_inst is a non
                   * type-converting 'mov' and we can modify its register
                   * types to match the ones in inst.  Otherwise, we could
                   * have an incorrect saturation result.
                   */
                  scan_inst->dst.type = inst->dst.type;
                  scan_inst->src[0].type = inst->src[0].type;
               }
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
vec4_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_d(0);
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}
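
/* For illustration, split_virtual_grfs() below turns a two-register virtual
 * GRF such as vgrf2 (size 2) into two single-register GRFs, folding the
 * reg_offset into the register number:
 *
 *    mov vgrf2.0, vgrf0   ->   mov vgrf2, vgrf0
 *    mov vgrf2.1, vgrf1   ->   mov vgrf9, vgrf1
 *
 * (vgrf9 being the freshly allocated register), so later passes can track
 * each piece independently.
 */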

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.  But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->regs_written > 1)
         split_grf[inst->dst.nr] = false;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF && inst->regs_read(i) > 1)
            split_grf[inst->src[i].nr] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
          inst->dst.reg_offset != 0) {
         inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
                         inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
                               inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d%s) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg,
              pred_ctrl_align16[inst->predicate]);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   switch (inst->dst.file) {
   case VGRF:
      fprintf(file, "vgrf%d.%d", inst->dst.nr, inst->dst.reg_offset);
      break;
   case FIXED_GRF:
      fprintf(file, "g%d", inst->dst.nr);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.nr);
      break;
   case ARF:
      switch (inst->dst.nr) {
      case BRW_ARF_NULL:
         fprintf(file, "null");
         break;
      case BRW_ARF_ADDRESS:
         fprintf(file, "a0.%d", inst->dst.subnr);
         break;
      case BRW_ARF_ACCUMULATOR:
         fprintf(file, "acc%d", inst->dst.subnr);
         break;
      case BRW_ARF_FLAG:
         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      default:
         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      }
      if (inst->dst.subnr)
         fprintf(file, "+%d", inst->dst.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case IMM:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case VGRF:
         fprintf(file, "vgrf%d", inst->src[i].nr);
         break;
      case FIXED_GRF:
         fprintf(file, "g%d", inst->src[i].nr);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].nr);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].nr);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case ARF:
         switch (inst->src[i].nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->src[i].subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->src[i].subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         }
         if (inst->src[i].subnr)
            fprintf(file, "+%d", inst->src[i].subnr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case MRF:
         unreachable("not reached");
      }

      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
      if (inst->src[i].reg_offset != 0 &&
          inst->src[i].file == VGRF &&
          alloc.sizes[inst->src[i].nr] != 1)
         fprintf(file, ".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         fprintf(file, ".");
         for (int c = 0; c < 4; c++) {
            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   if (inst->force_writemask_all)
      fprintf(file, " NoMask");

   fprintf(file, "\n");
}


static inline struct brw_reg
attribute_to_hw_reg(int attr, bool interleaved)
{
   if (interleaved)
      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
   else
      return brw_vec8_grf(attr, 0);
}


/**
 * Replace each register of type ATTR in this->instructions with a reference
 * to a fixed HW register.
 *
 * If interleaved is true, then each attribute takes up half a register, with
 * register N containing attribute 2*N in its first half and attribute 2*N+1
 * in its second half (this corresponds to the payload setup used by geometry
 * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
 * false, then each attribute takes up a whole register, with register N
 * containing attribute N (this corresponds to the payload setup used by
 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
 */
void
vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                          bool interleaved)
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.nr + inst->dst.reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.type = inst->dst.type;
         reg.writemask = inst->dst.writemask;

         inst->dst = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].nr + inst->src[i].reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i] = reg;
      }
   }
}

int
vec4_vs_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 2];
   memset(attribute_map, 0, sizeof(attribute_map));

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   if (vs_prog_data->uses_drawid) {
      attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes;
      nr_attributes++;
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
      nr_attributes++;
   }

   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);

   return payload_reg + vs_prog_data->nr_attributes;
}
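
/* For illustration, a shader reading attributes 0 and 3 with payload_reg ==
 * 2 gets attribute_map[0] = 2 and attribute_map[3] = 3, so
 *
 *    mov vgrf0, attr3.xyzw
 *
 * is lowered by lower_attributes_to_hw_regs() to
 *
 *    mov vgrf0, g3.xyzw
 *
 * with one full GRF per attribute in the non-interleaved VS layout.
 */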
1607 */ 1608 if (devinfo->gen < 6 && this->uniforms == 0) { 1609 assert(this->uniforms < this->uniform_array_size); 1610 1611 stage_prog_data->param = 1612 reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4); 1613 for (unsigned int i = 0; i < 4; i++) { 1614 unsigned int slot = this->uniforms * 4 + i; 1615 static gl_constant_value zero = { 0.0 }; 1616 stage_prog_data->param[slot] = &zero; 1617 } 1618 1619 this->uniforms++; 1620 reg++; 1621 } else { 1622 reg += ALIGN(uniforms, 2) / 2; 1623 } 1624 1625 stage_prog_data->nr_params = this->uniforms * 4; 1626 1627 prog_data->base.curb_read_length = 1628 reg - prog_data->base.dispatch_grf_start_reg; 1629 1630 return reg; 1631} 1632 1633void 1634vec4_vs_visitor::setup_payload(void) 1635{ 1636 int reg = 0; 1637 1638 /* The payload always contains important data in g0, which contains 1639 * the URB handles that are passed on to the URB write at the end 1640 * of the thread. So, we always start push constants at g1. 1641 */ 1642 reg++; 1643 1644 reg = setup_uniforms(reg); 1645 1646 reg = setup_attributes(reg); 1647 1648 this->first_non_payload_grf = reg; 1649} 1650 1651src_reg 1652vec4_visitor::get_timestamp() 1653{ 1654 assert(devinfo->gen >= 7); 1655 1656 src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, 1657 BRW_ARF_TIMESTAMP, 1658 0, 1659 0, 1660 0, 1661 BRW_REGISTER_TYPE_UD, 1662 BRW_VERTICAL_STRIDE_0, 1663 BRW_WIDTH_4, 1664 BRW_HORIZONTAL_STRIDE_4, 1665 BRW_SWIZZLE_XYZW, 1666 WRITEMASK_XYZW)); 1667 1668 dst_reg dst = dst_reg(this, glsl_type::uvec4_type); 1669 1670 vec4_instruction *mov = emit(MOV(dst, ts)); 1671 /* We want to read the 3 fields we care about (mostly field 0, but also 2) 1672 * even if it's not enabled in the dispatch. 1673 */ 1674 mov->force_writemask_all = true; 1675 1676 return src_reg(dst); 1677} 1678 1679void 1680vec4_visitor::emit_shader_time_begin() 1681{ 1682 current_annotation = "shader time start"; 1683 shader_start_time = get_timestamp(); 1684} 1685 1686void 1687vec4_visitor::emit_shader_time_end() 1688{ 1689 current_annotation = "shader time end"; 1690 src_reg shader_end_time = get_timestamp(); 1691 1692 1693 /* Check that there weren't any timestamp reset events (assuming these 1694 * were the only two timestamp reads that happened). 1695 */ 1696 src_reg reset_end = shader_end_time; 1697 reset_end.swizzle = BRW_SWIZZLE_ZZZZ; 1698 vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u))); 1699 test->conditional_mod = BRW_CONDITIONAL_Z; 1700 1701 emit(IF(BRW_PREDICATE_NORMAL)); 1702 1703 /* Take the current timestamp and get the delta. */ 1704 shader_start_time.negate = true; 1705 dst_reg diff = dst_reg(this, glsl_type::uint_type); 1706 emit(ADD(diff, shader_start_time, shader_end_time)); 1707 1708 /* If there were no instructions between the two timestamp gets, the diff 1709 * is 2 cycles. Remove that overhead, so I can forget about that when 1710 * trying to determine the time taken for single instructions. 

src_reg
vec4_visitor::get_timestamp()
{
   assert(devinfo->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                0,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_ud(), reset_end, brw_imm_ud(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), brw_imm_ud(-2u)));

   emit_shader_time_write(0, src_reg(diff));
   emit_shader_time_write(1, brw_imm_ud(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(2, brw_imm_ud(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
{
   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));

   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   int index = shader_time_index * 3 + shader_time_subindex;
   emit(MOV(offset, brw_imm_d(index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, value));

   vec4_instruction *inst =
      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
   inst->mlen = 2;
}
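
/* For illustration, convert_to_hw_regs() below maps a UNIFORM source such as
 * u5 (with dispatch_grf_start_reg == 1) to a fixed-GRF region in the second
 * half of g3: push constants are packed two vec4s per register, so uniform N
 * lives at register dispatch_grf_start_reg + N/2, half N%2, read with a
 * <0;4,1> region so every channel sees the same vec4.
 */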

void
vec4_visitor::convert_to_hw_regs()
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         struct src_reg &src = inst->src[i];
         struct brw_reg reg;
         switch (src.file) {
         case VGRF:
            reg = brw_vec8_grf(src.nr + src.reg_offset, 0);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;
            break;

         case UNIFORM:
            reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
                                      (src.nr + src.reg_offset) / 2,
                                      ((src.nr + src.reg_offset) % 2) * 4),
                         0, 4, 1);
            reg.type = src.type;
            reg.swizzle = src.swizzle;
            reg.abs = src.abs;
            reg.negate = src.negate;

            /* This should have been moved to pull constants. */
            assert(!src.reladdr);
            break;

         case ARF:
         case FIXED_GRF:
         case IMM:
            continue;

         case BAD_FILE:
            /* Probably unused. */
            reg = brw_null_reg();
            break;

         case MRF:
         case ATTR:
            unreachable("not reached");
         }

         src = reg;
      }

      if (inst->is_3src()) {
         /* 3-src instructions with scalar sources support arbitrary subnr,
          * but don't actually use swizzles.  Convert swizzle into subnr.
          */
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
               assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
               inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
            }
         }
      }

      dst_reg &dst = inst->dst;
      struct brw_reg reg;

      switch (inst->dst.file) {
      case VGRF:
         reg = brw_vec8_grf(dst.nr + dst.reg_offset, 0);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case MRF:
         assert(((dst.nr + dst.reg_offset) & ~BRW_MRF_COMPR4) <
                BRW_MAX_MRF(devinfo->gen));
         reg = brw_message_reg(dst.nr + dst.reg_offset);
         reg.type = dst.type;
         reg.writemask = dst.writemask;
         break;

      case ARF:
      case FIXED_GRF:
         reg = dst.as_brw_reg();
         break;

      case BAD_FILE:
         reg = brw_null_reg();
         break;

      case IMM:
      case ATTR:
      case UNIFORM:
         unreachable("not reached");
      }

      dst = reg;
   }
}

bool
vec4_visitor::run()
{
   if (shader_time_index >= 0)
      emit_shader_time_begin();

   emit_prolog();

   emit_nir_code();
   if (failed)
      return false;
   base_ir = NULL;

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   move_grf_array_access_to_scratch();
   move_uniform_array_access_to_pull_constants();

   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
                  stage_abbrev, nir->info.name, iteration, pass_num);  \
                                                                       \
         backend_shader::dump_instructions(filename);                  \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%s-00-start",
               stage_abbrev, nir->info.name);

      backend_shader::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_predicated_break, this);
      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   bool allocated_without_spills = reg_allocate();

   if (!allocated_without_spills) {
      compiler->shader_perf_log(log_data,
                                "%s shader triggered register spilling.  "
                                "Try reducing the number of live vec4 values "
                                "to improve performance.\n",
                                stage_name);

      while (!reg_allocate()) {
         if (failed)
            return false;
      }
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   convert_to_hw_regs();

   if (last_scratch > 0) {
      prog_data->base.total_scratch =
         brw_get_scratch_size(last_scratch * REG_SIZE);
   }

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_vs_prog_key *key,
               struct brw_vs_prog_data *prog_data,
               const nir_shader *src_shader,
               gl_clip_plane *clip_planes,
               bool use_legacy_snorm_formula,
               int shader_time_index,
               unsigned *final_assembly_size,
               char **error_str)
{
   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
                                      is_scalar);
   shader = brw_postprocess_nir(shader, compiler->devinfo, is_scalar);

   const unsigned *assembly = NULL;

   unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);

   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute.  So, add an extra slot.
    */
   if (shader->info.system_values_read &
       (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
        BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
        BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
      nr_attributes++;
   }

   /* gl_DrawID has its very own vec4 */
   if (shader->info.system_values_read &
       BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
      nr_attributes++;
   }

   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
    * vec4 mode, the hardware appears to wedge unless we read something.
    */
   if (is_scalar)
      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attributes, 2);
   else
      prog_data->base.urb_read_length =
         DIV_ROUND_UP(MAX2(nr_attributes, 1), 2);

   prog_data->nr_attributes = nr_attributes;

   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
    * (overwriting the original contents), we need to make sure the size is
    * the larger of the two.
    */
   const unsigned vue_entries =
      MAX2(nr_attributes, (unsigned)prog_data->base.vue_map.num_slots);

   if (compiler->devinfo->gen == 6)
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
   else
      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);

   if (is_scalar) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
                   NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
                   shader, 8, shader_time_index);
      if (!v.run_vs(clip_planes)) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
                     &prog_data->base.base, v.promoted_constants,
                     v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
      if (INTEL_DEBUG & DEBUG_VS) {
         const char *debug_name =
            ralloc_asprintf(mem_ctx, "%s vertex shader %s",
                            shader->info.label ? shader->info.label : "unnamed",
                            shader->info.name);

         g.enable_debug(debug_name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);
   }

   if (!assembly) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

      vec4_vs_visitor v(compiler, log_data, key, prog_data,
                        shader, clip_planes, mem_ctx,
                        shader_time_index, use_legacy_snorm_formula);
      if (!v.run()) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);

         return NULL;
      }

      assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
                                            shader, &prog_data->base, v.cfg,
                                            final_assembly_size);
   }

   return assembly;
}

} /* extern "C" */
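
/* For reference, a minimal sketch of how a caller might drive this entry
 * point (the surrounding objects are hypothetical; only the brw_compile_vs()
 * signature above is from this file):
 *
 *    unsigned size;
 *    char *error = NULL;
 *    const unsigned *code =
 *       brw_compile_vs(compiler, log_data, mem_ctx, &key, &prog_data,
 *                      nir, clip_planes, false, -1, &size, &error);
 *    if (code == NULL)
 *       fprintf(stderr, "VS compile failed: %s\n", error);
 *
 * A shader_time_index of -1 disables shader-time instrumentation (run()
 * only emits it when the index is >= 0), and the returned assembly is
 * allocated out of mem_ctx.
 */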