brw_vec4.cpp revision 0b91bcea98c0fe201bba89abe1ca3aee4d04c56c
1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 */

#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_vs.h"
#include "brw_nir.h"
#include "brw_vec4_live_variables.h"
#include "brw_dead_control_flow.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}
#include "main/context.h"

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

/* Reset this register to a known-empty state: all fields zeroed and the
 * file marked BAD_FILE so accidental uses are detectable.
 */
void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

/* Construct a source register in \p file at index \p reg.  The swizzle is
 * sized to the GLSL type's vector width when one is given; aggregates keep
 * the identity XYZW swizzle.
 */
src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   else
      this->swizzle = BRW_SWIZZLE_XYZW;
   if (type)
      this->type = brw_type_for_base_type(type);
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

/** Float immediate constructor. */
src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
}

/** Unsigned 32-bit immediate constructor. */
src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
}

/** Signed 32-bit immediate constructor. */
src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
}

/** Vector-float (VF) immediate from four packed VF bytes. */
src_reg::src_reg(uint8_t vf[4])
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

/** Vector-float (VF) immediate from four individual VF-encoded bytes. */
src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   /* Pack the four VF bytes little-endian: vf0 is component X. */
   this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
                               (vf1 << 8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

/** Wrap a fixed hardware register as a source. */
src_reg::src_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

/* Convert a destination register to a source; the writemask is translated
 * into the equivalent swizzle.
 */
src_reg::src_reg(const dst_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
   this->swizzle = brw_swizzle_for_mask(reg.writemask);
}

/* Reset to a known-empty destination: BAD_FILE, full XYZW writemask. */
void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

/* Destination register typed from a GLSL type, with an explicit writemask. */
dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

/* Destination register with an explicit hardware type and writemask. */
dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = type;
   this->writemask = writemask;
}

/** Wrap a fixed hardware register as a destination. */
dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

/* Convert a source register to a destination; the swizzle is translated
 * into the equivalent writemask.
 */
dst_reg::dst_reg(const src_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = brw_mask_for_swizzle(reg.swizzle);
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}

/* Field-wise equality; fixed_hw_reg contents only matter for HW_REG/IMM
 * files, so they are only compared in that case.
 */
bool
dst_reg::equals(const dst_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           writemask == r.writemask &&
           (reladdr == r.reladdr ||
            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
           ((file != HW_REG && file != IMM) ||
            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                   sizeof(fixed_hw_reg)) == 0));
}

/* Whether this opcode is lowered to a SEND whose payload sources live in
 * GRFs (rather than being copied through MRFs first).
 */
bool
vec4_instruction::is_send_from_grf()
{
   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return true;
   default:
      return false;
   }
}

/* Number of registers read by source \p arg.  For send-from-GRF opcodes the
 * payload source covers mlen registers; everything else reads one.
 */
unsigned
vec4_instruction::regs_read(unsigned arg) const
{
   if (src[arg].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      return arg == 0 ? mlen : 1;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1 ? mlen : 1;

   default:
      return 1;
   }
}

/* Whether source modifiers (abs/negate) may be applied to this
 * instruction's sources.  Gen6 math and send-from-GRF messages can't
 * take them.
 */
bool
vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0 || inst->is_send_from_grf())
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 3;
   case GS_OPCODE_URB_WRITE:
   case GS_OPCODE_URB_WRITE_ALLOCATE:
   case GS_OPCODE_THREAD_END:
      return 0;
   case GS_OPCODE_FF_SYNC:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
      return inst->header_size;
   default:
      unreachable("not reached");
   }
}

/* Strict field-wise equality.  Unlike dst_reg::equals(), any reladdr on
 * either side makes the registers unequal, and fixed_hw_reg is always
 * compared.
 */
bool
src_reg::equals(const src_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           swizzle == r.swizzle &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0);
}

/* Coalesce runs of partial-writemask MOVs of float immediates to the same
 * destination into a single full-XYZW MOV of a packed VF immediate, when
 * every immediate is representable as a VF byte.
 */
bool
vec4_visitor::opt_vector_float()
{
   bool progress = false;

   int last_reg = -1, last_reg_offset = -1;
   enum register_file last_reg_file = BAD_FILE;

   int remaining_channels = 0;
   uint8_t imm[4];
   int inst_count = 0;
   vec4_instruction *imm_inst[4];

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* A different destination resets the accumulation state. */
      if (last_reg != inst->dst.reg ||
          last_reg_offset != inst->dst.reg_offset ||
          last_reg_file != inst->dst.file) {
         last_reg = inst->dst.reg;
         last_reg_offset = inst->dst.reg_offset;
         last_reg_file = inst->dst.file;
         remaining_channels = WRITEMASK_XYZW;

         inst_count = 0;
      }

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->dst.writemask == WRITEMASK_XYZW ||
          inst->src[0].file != IMM)
         continue;

      /* brw_float_to_vf() returns -1 when the float has no VF encoding. */
      int vf = brw_float_to_vf(inst->src[0].fixed_hw_reg.dw1.f);
      if (vf == -1)
         continue;

      if ((inst->dst.writemask & WRITEMASK_X) != 0)
         imm[0] = vf;
      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
         imm[1] = vf;
      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
         imm[2] = vf;
      if ((inst->dst.writemask & WRITEMASK_W) != 0)
         imm[3] = vf;

      imm_inst[inst_count++] = inst;

      remaining_channels &= ~inst->dst.writemask;
      if (remaining_channels == 0) {
         /* All four channels covered: emit one full VF MOV and delete the
          * partial MOVs that fed it.
          */
         vec4_instruction *mov = MOV(inst->dst, imm);
         mov->dst.type = BRW_REGISTER_TYPE_F;
         mov->dst.writemask = WRITEMASK_XYZW;
         inst->insert_after(block, mov);
         last_reg = -1;

         for (int i = 0; i < inst_count; i++) {
            imm_inst[i]->remove(block);
         }
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/* Replaces unused channels of a swizzle with channels that are used.
 *
 * For instance, this pass transforms
 *
 *    mov vgrf4.yz, vgrf5.wxzy
 *
 * into
 *
 *    mov vgrf4.yz, vgrf5.xxzx
 *
 * This eliminates false uses of some channels, letting dead code elimination
 * remove the instructions that wrote them.
 */
bool
vec4_visitor::opt_reduce_swizzle()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
          inst->is_send_from_grf())
         continue;

      unsigned swizzle;

      /* Determine which channels of the sources are read. */
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
                            *           but all four of src1.
                            */
         swizzle = brw_swizzle_for_size(4);
         break;
      case BRW_OPCODE_DP3:
         swizzle = brw_swizzle_for_size(3);
         break;
      case BRW_OPCODE_DP2:
         swizzle = brw_swizzle_for_size(2);
         break;
      default:
         /* Ordinary instructions only read the channels their writemask
          * produces.
          */
         swizzle = brw_swizzle_for_mask(inst->dst.writemask);
         break;
      }

      /* Update sources' swizzles. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF &&
             inst->src[i].file != ATTR &&
             inst->src[i].file != UNIFORM)
            continue;

         const unsigned new_swizzle =
            brw_compose_swizzle(swizzle, inst->src[i].swizzle);
         if (inst->src[i].swizzle != new_swizzle) {
            inst->src[i].swizzle = new_swizzle;
            progress = true;
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over). Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector. The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         /* Fold the reg_offset into the base register index. */
         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}

/* Pack live uniform vectors tightly together, dropping unused ones and
 * rewriting instructions to point at the new locations.
 */
void
vec4_visitor::pack_uniform_registers()
{
   /* NOTE: variable-length arrays — a GCC/Clang extension in C++. */
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program. We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = this->uniform_vector_size[src];

      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];
         /* Shift the swizzle by the channel offset within the new vec4. */
         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                              new_chan[src], new_chan[src]);
      }
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons. One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            /* Fold the saturate into the immediate itself when possible. */
            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].fixed_hw_reg)) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         /* Unpacking is only meaningful for UNIFORM sources; otherwise it
          * degenerates to a plain MOV.
          */
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         /* a + 0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         /* a * 0 = 0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            /* a * 1 = a */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            /* a * -1 = -a */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         /* cmp.ge -|a|, 0 is only true when a == 0: turn it into cmp.z. */
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         /* rcp(sqrt(a)) = rsq(a) — only matched against the immediately
          * preceding instruction.
          */
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0]) ||
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants. We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            /* No existing copy: append this vec4 to the pull params. */
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         /* Load the pull constant into a temporary and redirect the source
          * at it.
          */
         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}

/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
{
#define IS_DWORD(reg) \
   (reg.type == BRW_REGISTER_TYPE_UD || \
    reg.type == BRW_REGISTER_TYPE_D)

   /* "When source or destination datatype is 64b or operation is integer DWord
    * multiply, DepCtrl must not be used."
    * May apply to future SoCs as well.
    */
   if (devinfo->is_cherryview) {
      if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
         return true;
   }
#undef IS_DWORD

   if (devinfo->gen >= 8) {
      if (inst->opcode == BRW_OPCODE_F32TO16)
         return true;
   }

   /*
    * mlen:
    * In the presence of send messages, totally interrupt dependency
    * control. They're long enough that the chance of dependency
    * control around them just doesn't matter.
    *
    * predicate:
    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
    * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
    * completes the scoreboard clear must have a non-zero execution mask. This
    * means, if any kind of predication can change the execution mask or channel
    * enable of the last instruction, the optimization must be avoided. This is
    * to avoid instructions being shot down the pipeline when no writes are
    * required.
    *
    * math:
    * Dependency control does not work well over math instructions.
    * NB: Discovered empirically
    */
   return (inst->mlen || inst->predicate || inst->is_math());
}

/**
 * Sets the dependency control fields on instructions after register
 * allocation and before the generator is run.
 *
 * When you have a sequence of instructions like:
 *
 *   DP4 temp.x vertex uniform[0]
 *   DP4 temp.y vertex uniform[0]
 *   DP4 temp.z vertex uniform[0]
 *   DP4 temp.w vertex uniform[0]
 *
 * The hardware doesn't know that it can actually run the later instructions
 * while the previous ones are in flight, producing stalls. However, we have
 * manual fields we can set in the instructions that let it do so.
870 */ 871void 872vec4_visitor::opt_set_dependency_control() 873{ 874 vec4_instruction *last_grf_write[BRW_MAX_GRF]; 875 uint8_t grf_channels_written[BRW_MAX_GRF]; 876 vec4_instruction *last_mrf_write[BRW_MAX_GRF]; 877 uint8_t mrf_channels_written[BRW_MAX_GRF]; 878 879 assert(prog_data->total_grf || 880 !"Must be called after register allocation"); 881 882 foreach_block (block, cfg) { 883 memset(last_grf_write, 0, sizeof(last_grf_write)); 884 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 885 886 foreach_inst_in_block (vec4_instruction, inst, block) { 887 /* If we read from a register that we were doing dependency control 888 * on, don't do dependency control across the read. 889 */ 890 for (int i = 0; i < 3; i++) { 891 int reg = inst->src[i].reg + inst->src[i].reg_offset; 892 if (inst->src[i].file == GRF) { 893 last_grf_write[reg] = NULL; 894 } else if (inst->src[i].file == HW_REG) { 895 memset(last_grf_write, 0, sizeof(last_grf_write)); 896 break; 897 } 898 assert(inst->src[i].file != MRF); 899 } 900 901 if (is_dep_ctrl_unsafe(inst)) { 902 memset(last_grf_write, 0, sizeof(last_grf_write)); 903 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 904 continue; 905 } 906 907 /* Now, see if we can do dependency control for this instruction 908 * against a previous one writing to its destination. 
909 */ 910 int reg = inst->dst.reg + inst->dst.reg_offset; 911 if (inst->dst.file == GRF) { 912 if (last_grf_write[reg] && 913 !(inst->dst.writemask & grf_channels_written[reg])) { 914 last_grf_write[reg]->no_dd_clear = true; 915 inst->no_dd_check = true; 916 } else { 917 grf_channels_written[reg] = 0; 918 } 919 920 last_grf_write[reg] = inst; 921 grf_channels_written[reg] |= inst->dst.writemask; 922 } else if (inst->dst.file == MRF) { 923 if (last_mrf_write[reg] && 924 !(inst->dst.writemask & mrf_channels_written[reg])) { 925 last_mrf_write[reg]->no_dd_clear = true; 926 inst->no_dd_check = true; 927 } else { 928 mrf_channels_written[reg] = 0; 929 } 930 931 last_mrf_write[reg] = inst; 932 mrf_channels_written[reg] |= inst->dst.writemask; 933 } else if (inst->dst.reg == HW_REG) { 934 if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) 935 memset(last_grf_write, 0, sizeof(last_grf_write)); 936 if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE) 937 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 938 } 939 } 940 } 941} 942 943bool 944vec4_instruction::can_reswizzle(int dst_writemask, 945 int swizzle, 946 int swizzle_mask) 947{ 948 /* If this instruction sets anything not referenced by swizzle, then we'd 949 * totally break it when we reswizzle. 950 */ 951 if (dst.writemask & ~swizzle_mask) 952 return false; 953 954 if (mlen > 0) 955 return false; 956 957 return true; 958} 959 960/** 961 * For any channels in the swizzle's source that were populated by this 962 * instruction, rewrite the instruction to put the appropriate result directly 963 * in those channels. 964 * 965 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x 966 */ 967void 968vec4_instruction::reswizzle(int dst_writemask, int swizzle) 969{ 970 /* Destination write mask doesn't correspond to source swizzle for the dot 971 * product and pack_bytes instructions. 
    */
   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
       opcode != VEC4_OPCODE_PACK_BYTES) {
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BAD_FILE || src[i].file == IMM)
            continue;

         src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
      }
   }

   /* Apply the specified swizzle and writemask to the original mask of
    * written components.
    */
   dst.writemask = dst_writemask &
                   brw_apply_swizzle_to_mask(swizzle, dst.writemask);
}

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      /* Only plain unmodified GRF-to-GRF/MRF MOVs are candidates. */
      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst, block) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written))
            break;

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
vec4_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         /* Outside control flow all channels are live, so channel 0 is the
          * first live one.
          */
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = src_reg(0);
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.
 * But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   /* For each splittable VGRF i, new_virtual_grf[i] is the first of the
    * freshly allocated single-register VGRFs that will hold its
    * reg_offset >= 1 parts (offset 0 stays in register i itself).
    */
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split: anything read or written more than one register at a
    * time must stay contiguous, so mark it unsplittable.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && inst->regs_written > 1)
         split_grf[inst->dst.reg] = false;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
            split_grf[inst->src[i].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      /* reg_offset k > 0 of register i will live in new_virtual_grf[i] +
       * k - 1; the assert checks the allocator really handed out
       * contiguous numbers.
       */
      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   /* Rewrite all uses: any access at reg_offset != 0 of a split register
    * is redirected to the corresponding new single-size register.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}

/** Dump a single IR instruction to stderr. */
void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

/**
 * Print a human-readable form of one vec4 IR instruction to \p file:
 * predicate, opcode (with saturate / conditional-mod decorations),
 * destination, then up to three sources.
 */
void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   /* Predicate prefix, e.g. "(+f0.1) " or "(-f0.0) " when inverted. */
   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ?
              '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      /* Print the flag register only where the conditional mod actually
       * targets one (not for SEL/IF/WHILE on gen5+, and not when the
       * predicate already printed it).
       */
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   /* Destination: register file, number, offset, writemask and type. */
   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         /* Architecture registers get their mnemonic names. */
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   default:
      fprintf(file, "???");
      break;
   }
   /* Only print a writemask when it's not the full .xyzw. */
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   /* Sources, up to three, stopping at the first BAD_FILE. */
   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg);
         break;
      case IMM:
         /* Immediates print their value with a type suffix. */
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            /* A VF immediate packs four 8-bit restricted floats into one
             * dword; decode each byte for display.
             */
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file,
"f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, 1408 inst->src[i].fixed_hw_reg.subnr); 1409 break; 1410 default: 1411 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, 1412 inst->src[i].fixed_hw_reg.subnr); 1413 break; 1414 } 1415 } else { 1416 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr); 1417 } 1418 if (inst->src[i].fixed_hw_reg.subnr) 1419 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr); 1420 if (inst->src[i].fixed_hw_reg.abs) 1421 fprintf(file, "|"); 1422 break; 1423 case BAD_FILE: 1424 fprintf(file, "(null)"); 1425 break; 1426 default: 1427 fprintf(file, "???"); 1428 break; 1429 } 1430 1431 /* Don't print .0; and only VGRFs have reg_offsets and sizes */ 1432 if (inst->src[i].reg_offset != 0 && 1433 inst->src[i].file == GRF && 1434 alloc.sizes[inst->src[i].reg] != 1) 1435 fprintf(file, ".%d", inst->src[i].reg_offset); 1436 1437 if (inst->src[i].file != IMM) { 1438 static const char *chans[4] = {"x", "y", "z", "w"}; 1439 fprintf(file, "."); 1440 for (int c = 0; c < 4; c++) { 1441 fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]); 1442 } 1443 } 1444 1445 if (inst->src[i].abs) 1446 fprintf(file, "|"); 1447 1448 if (inst->src[i].file != IMM) { 1449 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type)); 1450 } 1451 1452 if (i < 2 && inst->src[i + 1].file != BAD_FILE) 1453 fprintf(file, ", "); 1454 } 1455 1456 fprintf(file, "\n"); 1457} 1458 1459 1460static inline struct brw_reg 1461attribute_to_hw_reg(int attr, bool interleaved) 1462{ 1463 if (interleaved) 1464 return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1); 1465 else 1466 return brw_vec8_grf(attr, 0); 1467} 1468 1469 1470/** 1471 * Replace each register of type ATTR in this->instructions with a reference 1472 * to a fixed HW register. 
1473 * 1474 * If interleaved is true, then each attribute takes up half a register, with 1475 * register N containing attribute 2*N in its first half and attribute 2*N+1 1476 * in its second half (this corresponds to the payload setup used by geometry 1477 * shaders in "single" or "dual instanced" dispatch mode). If interleaved is 1478 * false, then each attribute takes up a whole register, with register N 1479 * containing attribute N (this corresponds to the payload setup used by 1480 * vertex shaders, and by geometry shaders in "dual object" dispatch mode). 1481 */ 1482void 1483vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map, 1484 bool interleaved) 1485{ 1486 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 1487 /* We have to support ATTR as a destination for GL_FIXED fixup. */ 1488 if (inst->dst.file == ATTR) { 1489 int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset]; 1490 1491 /* All attributes used in the shader need to have been assigned a 1492 * hardware register by the caller 1493 */ 1494 assert(grf != 0); 1495 1496 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved); 1497 reg.type = inst->dst.type; 1498 reg.dw1.bits.writemask = inst->dst.writemask; 1499 1500 inst->dst.file = HW_REG; 1501 inst->dst.fixed_hw_reg = reg; 1502 } 1503 1504 for (int i = 0; i < 3; i++) { 1505 if (inst->src[i].file != ATTR) 1506 continue; 1507 1508 int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset]; 1509 1510 /* All attributes used in the shader need to have been assigned a 1511 * hardware register by the caller 1512 */ 1513 assert(grf != 0); 1514 1515 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved); 1516 reg.dw1.bits.swizzle = inst->src[i].swizzle; 1517 reg.type = inst->src[i].type; 1518 if (inst->src[i].abs) 1519 reg = brw_abs(reg); 1520 if (inst->src[i].negate) 1521 reg = negate(reg); 1522 1523 inst->src[i].file = HW_REG; 1524 inst->src[i].fixed_hw_reg = reg; 1525 } 1526 } 1527} 1528 1529int 
1530vec4_vs_visitor::setup_attributes(int payload_reg) 1531{ 1532 int nr_attributes; 1533 int attribute_map[VERT_ATTRIB_MAX + 1]; 1534 memset(attribute_map, 0, sizeof(attribute_map)); 1535 1536 nr_attributes = 0; 1537 for (int i = 0; i < VERT_ATTRIB_MAX; i++) { 1538 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) { 1539 attribute_map[i] = payload_reg + nr_attributes; 1540 nr_attributes++; 1541 } 1542 } 1543 1544 /* VertexID is stored by the VF as the last vertex element, but we 1545 * don't represent it with a flag in inputs_read, so we call it 1546 * VERT_ATTRIB_MAX. 1547 */ 1548 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) { 1549 attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes; 1550 nr_attributes++; 1551 } 1552 1553 lower_attributes_to_hw_regs(attribute_map, false /* interleaved */); 1554 1555 /* The BSpec says we always have to read at least one thing from 1556 * the VF, and it appears that the hardware wedges otherwise. 1557 */ 1558 if (nr_attributes == 0) 1559 nr_attributes = 1; 1560 1561 prog_data->urb_read_length = (nr_attributes + 1) / 2; 1562 1563 unsigned vue_entries = 1564 MAX2(nr_attributes, prog_data->vue_map.num_slots); 1565 1566 if (devinfo->gen == 6) 1567 prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8; 1568 else 1569 prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4; 1570 1571 return payload_reg + nr_attributes; 1572} 1573 1574int 1575vec4_visitor::setup_uniforms(int reg) 1576{ 1577 prog_data->base.dispatch_grf_start_reg = reg; 1578 1579 /* The pre-gen6 VS requires that some push constants get loaded no 1580 * matter what, or the GPU would hang. 
1581 */ 1582 if (devinfo->gen < 6 && this->uniforms == 0) { 1583 assert(this->uniforms < this->uniform_array_size); 1584 this->uniform_vector_size[this->uniforms] = 1; 1585 1586 stage_prog_data->param = 1587 reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4); 1588 for (unsigned int i = 0; i < 4; i++) { 1589 unsigned int slot = this->uniforms * 4 + i; 1590 static gl_constant_value zero = { 0.0 }; 1591 stage_prog_data->param[slot] = &zero; 1592 } 1593 1594 this->uniforms++; 1595 reg++; 1596 } else { 1597 reg += ALIGN(uniforms, 2) / 2; 1598 } 1599 1600 stage_prog_data->nr_params = this->uniforms * 4; 1601 1602 prog_data->base.curb_read_length = 1603 reg - prog_data->base.dispatch_grf_start_reg; 1604 1605 return reg; 1606} 1607 1608void 1609vec4_vs_visitor::setup_payload(void) 1610{ 1611 int reg = 0; 1612 1613 /* The payload always contains important data in g0, which contains 1614 * the URB handles that are passed on to the URB write at the end 1615 * of the thread. So, we always start push constants at g1. 1616 */ 1617 reg++; 1618 1619 reg = setup_uniforms(reg); 1620 1621 reg = setup_attributes(reg); 1622 1623 this->first_non_payload_grf = reg; 1624} 1625 1626void 1627vec4_visitor::assign_binding_table_offsets() 1628{ 1629 assign_common_binding_table_offsets(0); 1630} 1631 1632src_reg 1633vec4_visitor::get_timestamp() 1634{ 1635 assert(devinfo->gen >= 7); 1636 1637 src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, 1638 BRW_ARF_TIMESTAMP, 1639 0, 1640 0, 1641 0, 1642 BRW_REGISTER_TYPE_UD, 1643 BRW_VERTICAL_STRIDE_0, 1644 BRW_WIDTH_4, 1645 BRW_HORIZONTAL_STRIDE_4, 1646 BRW_SWIZZLE_XYZW, 1647 WRITEMASK_XYZW)); 1648 1649 dst_reg dst = dst_reg(this, glsl_type::uvec4_type); 1650 1651 vec4_instruction *mov = emit(MOV(dst, ts)); 1652 /* We want to read the 3 fields we care about (mostly field 0, but also 2) 1653 * even if it's not enabled in the dispatch. 
1654 */ 1655 mov->force_writemask_all = true; 1656 1657 return src_reg(dst); 1658} 1659 1660void 1661vec4_visitor::emit_shader_time_begin() 1662{ 1663 current_annotation = "shader time start"; 1664 shader_start_time = get_timestamp(); 1665} 1666 1667void 1668vec4_visitor::emit_shader_time_end() 1669{ 1670 current_annotation = "shader time end"; 1671 src_reg shader_end_time = get_timestamp(); 1672 1673 1674 /* Check that there weren't any timestamp reset events (assuming these 1675 * were the only two timestamp reads that happened). 1676 */ 1677 src_reg reset_end = shader_end_time; 1678 reset_end.swizzle = BRW_SWIZZLE_ZZZZ; 1679 vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u))); 1680 test->conditional_mod = BRW_CONDITIONAL_Z; 1681 1682 emit(IF(BRW_PREDICATE_NORMAL)); 1683 1684 /* Take the current timestamp and get the delta. */ 1685 shader_start_time.negate = true; 1686 dst_reg diff = dst_reg(this, glsl_type::uint_type); 1687 emit(ADD(diff, shader_start_time, shader_end_time)); 1688 1689 /* If there were no instructions between the two timestamp gets, the diff 1690 * is 2 cycles. Remove that overhead, so I can forget about that when 1691 * trying to determine the time taken for single instructions. 
1692 */ 1693 emit(ADD(diff, src_reg(diff), src_reg(-2u))); 1694 1695 emit_shader_time_write(0, src_reg(diff)); 1696 emit_shader_time_write(1, src_reg(1u)); 1697 emit(BRW_OPCODE_ELSE); 1698 emit_shader_time_write(2, src_reg(1u)); 1699 emit(BRW_OPCODE_ENDIF); 1700} 1701 1702void 1703vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value) 1704{ 1705 dst_reg dst = 1706 dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2)); 1707 1708 dst_reg offset = dst; 1709 dst_reg time = dst; 1710 time.reg_offset++; 1711 1712 offset.type = BRW_REGISTER_TYPE_UD; 1713 int index = shader_time_index * 3 + shader_time_subindex; 1714 emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE))); 1715 1716 time.type = BRW_REGISTER_TYPE_UD; 1717 emit(MOV(time, src_reg(value))); 1718 1719 vec4_instruction *inst = 1720 emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst)); 1721 inst->mlen = 2; 1722} 1723 1724bool 1725vec4_visitor::run() 1726{ 1727 bool use_vec4_nir = 1728 compiler->glsl_compiler_options[stage].NirOptions != NULL; 1729 1730 sanity_param_count = prog->Parameters->NumParameters; 1731 1732 if (shader_time_index >= 0) 1733 emit_shader_time_begin(); 1734 1735 assign_binding_table_offsets(); 1736 1737 emit_prolog(); 1738 1739 if (use_vec4_nir) { 1740 assert(prog->nir != NULL); 1741 emit_nir_code(); 1742 if (failed) 1743 return false; 1744 } else if (shader) { 1745 /* Generate VS IR for main(). (the visitor only descends into 1746 * functions called "main"). 1747 */ 1748 visit_instructions(shader->base.ir); 1749 } else { 1750 emit_program_code(); 1751 } 1752 base_ir = NULL; 1753 1754 emit_thread_end(); 1755 1756 calculate_cfg(); 1757 1758 /* Before any optimization, push array accesses out to scratch 1759 * space where we need them to be. This pass may allocate new 1760 * virtual GRFs, so we want to do it early. 
    * It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader || use_vec4_nir) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move.  There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

/* Statement-expression wrapper around an optimization pass: runs the pass,
 * dumps the IR after it when INTEL_DEBUG=optimizer reports progress,
 * accumulates into `progress`, and evaluates to the pass's own result.
 */
#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
                  stage_abbrev, shader_prog ? shader_prog->Name : 0,   \
                  iteration, pass_num);                                \
                                                                       \
         backend_shader::dump_instructions(filename);                  \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })


   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%04d-00-start",
               stage_abbrev, shader_prog ? shader_prog->Name : 0);

      backend_shader::dump_instructions(filename);
   }

   /* Iterate the pass list to a fixed point. */
   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   /* opt_vector_float can expose new opportunities; clean up after it. */
   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   bool allocated_without_spills = reg_allocate();

   if (!allocated_without_spills) {
      compiler->shader_perf_log(log_data,
                                "%s shader triggered register spilling. "
                                "Try reducing the number of live vec4 values "
                                "to improve performance.\n",
                                stage_name);

      /* Retry until allocation succeeds or the visitor records failure. */
      while (!reg_allocate()) {
         if (failed)
            return false;
      }
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   if (last_scratch > 0) {
      prog_data->base.total_scratch =
         brw_get_scratch_size(last_scratch * REG_SIZE);
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.
Make 1877 * sure that didn't happen. 1878 */ 1879 assert(sanity_param_count == prog->Parameters->NumParameters); 1880 1881 return !failed; 1882} 1883 1884} /* namespace brw */ 1885 1886extern "C" { 1887 1888/** 1889 * Compile a vertex shader. 1890 * 1891 * Returns the final assembly and the program's size. 1892 */ 1893const unsigned * 1894brw_vs_emit(struct brw_context *brw, 1895 void *mem_ctx, 1896 const struct brw_vs_prog_key *key, 1897 struct brw_vs_prog_data *prog_data, 1898 struct gl_vertex_program *vp, 1899 struct gl_shader_program *prog, 1900 unsigned *final_assembly_size) 1901{ 1902 bool start_busy = false; 1903 double start_time = 0; 1904 const unsigned *assembly = NULL; 1905 1906 if (unlikely(brw->perf_debug)) { 1907 start_busy = (brw->batch.last_bo && 1908 drm_intel_bo_busy(brw->batch.last_bo)); 1909 start_time = get_time(); 1910 } 1911 1912 struct brw_shader *shader = NULL; 1913 if (prog) 1914 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX]; 1915 1916 int st_index = -1; 1917 if (INTEL_DEBUG & DEBUG_SHADER_TIME) 1918 st_index = brw_get_shader_time_index(brw, prog, &vp->Base, ST_VS); 1919 1920 if (unlikely(INTEL_DEBUG & DEBUG_VS)) 1921 brw_dump_ir("vertex", prog, &shader->base, &vp->Base); 1922 1923 if (!vp->Base.nir && 1924 (brw->intelScreen->compiler->scalar_vs || 1925 brw->intelScreen->compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions != NULL)) { 1926 /* Normally we generate NIR in LinkShader() or 1927 * ProgramStringNotify(), but Mesa's fixed-function vertex program 1928 * handling doesn't notify the driver at all. Just do it here, at 1929 * the last minute, even though it's lame. 
       */
      assert(vp->Base.Id == 0 && prog == NULL);
      vp->Base.nir =
         brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX,
                        brw->intelScreen->compiler->scalar_vs);
   }

   /* Scalar (SIMD8 fs_visitor) backend path. */
   if (brw->intelScreen->compiler->scalar_vs) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;

      fs_visitor v(brw->intelScreen->compiler, brw,
                   mem_ctx, MESA_SHADER_VERTEX, key,
                   &prog_data->base.base, prog, &vp->Base,
                   8, st_index);
      if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      fs_generator g(brw->intelScreen->compiler, brw,
                     mem_ctx, (void *) key, &prog_data->base.base,
                     &vp->Base, v.promoted_constants,
                     v.runtime_check_aads_emit, "VS");
      if (INTEL_DEBUG & DEBUG_VS) {
         char *name;
         if (prog) {
            name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
                                   prog->Label ? prog->Label : "unnamed",
                                   prog->Name);
         } else {
            name = ralloc_asprintf(mem_ctx, "vertex program %d",
                                   vp->Base.Id);
         }
         g.enable_debug(name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);
   }

   /* vec4 backend path (used when the scalar path is disabled or
    * produced no assembly).
    */
   if (!assembly) {
      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;

      vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data,
                        vp, prog, brw_select_clip_planes(&brw->ctx),
                        mem_ctx, st_index,
                        !_mesa_is_gles3(&brw->ctx));
      if (!v.run()) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      vec4_generator g(brw->intelScreen->compiler, brw,
                       prog, &vp->Base, &prog_data->base,
                       mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
      assembly = g.generate_assembly(v.cfg, final_assembly_size);
   }

   /* Perf debug: report recompiles and compiles that stalled the GPU. */
   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once) {
         brw_vs_debug_recompile(brw, prog, key);
      }
      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      shader->compiled_once = true;
   }

   return assembly;
}

} /* extern "C" */