// brw_vec4.cpp revision 63d6d09a3b3790c5ec00f2cbc06f58c82ae40b0c
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_vs.h"
#include "brw_dead_control_flow.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

/**
 * Common helper for constructing swizzles.  When only a subset of
 * channels of a vec4 are used, we don't want to reference the other
 * channels, as that will tell optimization passes that those other
 * channels are used.
 */
unsigned
swizzle_for_size(int size)
{
   static const unsigned size_swizzles[4] = {
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
   };

   assert((size >= 1) && (size <= 4));
   return size_swizzles[size - 1];
}
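
/* For example, swizzle_for_size(2) returns BRW_SWIZZLE4(X, Y, Y, Y): the
 * two live channels keep their positions and the unused third and fourth
 * slots just repeat the last live channel, so no pass mistakes .z or .w
 * for a real read.  swizzle_for_size(4) is the identity swizzle XYZW.
 */
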
void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = swizzle_for_size(type->vector_elements);
   else
      this->swizzle = BRW_SWIZZLE_XYZW;
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
}

src_reg::src_reg(uint8_t vf[4])
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

src_reg::src_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

src_reg::src_reg(dst_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;

   int swizzles[4];
   int next_chan = 0;
   int last = 0;

   for (int i = 0; i < 4; i++) {
      if (!(reg.writemask & (1 << i)))
         continue;

      swizzles[next_chan++] = last = i;
   }

   for (; next_chan < 4; next_chan++) {
      swizzles[next_chan] = last;
   }

   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                                swizzles[2], swizzles[3]);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 int writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}

dst_reg::dst_reg(src_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   /* How should we do writemasking when converting from a src_reg?  It seems
    * pretty obvious that for src.xxxx the caller wants to write to src.x, but
    * what about for src.wx?  Just special-case src.xxxx for now.
    */
   if (reg.swizzle == BRW_SWIZZLE_XXXX)
      this->writemask = WRITEMASK_X;
   else
      this->writemask = WRITEMASK_XYZW;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}
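
/* Worked examples of the two conversions above: src_reg(dst_reg) packs the
 * live writemask channels into the front of the swizzle and repeats the
 * last one, so a dst writemask of .xz becomes the swizzle xzzz.  In the
 * other direction, dst_reg(src_reg) only trusts the swizzle xxxx to mean
 * "write .x"; any other swizzle conservatively yields a full xyzw
 * writemask.
 */
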
bool
dst_reg::equals(const dst_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           writemask == r.writemask &&
           (reladdr == r.reladdr ||
            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0);
}

bool
vec4_instruction::is_send_from_grf()
{
   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return true;
   default:
      return false;
   }
}

unsigned
vec4_instruction::regs_read(unsigned arg) const
{
   if (src[arg].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return arg == 0 ? mlen : 1;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1 ? mlen : 1;

   default:
      return 1;
   }
}

bool
vec4_instruction::can_do_source_mods(struct brw_context *brw)
{
   if (brw->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0 || inst->is_send_from_grf())
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 3;
   case GS_OPCODE_URB_WRITE:
   case GS_OPCODE_URB_WRITE_ALLOCATE:
   case GS_OPCODE_THREAD_END:
      return 0;
   case GS_OPCODE_FF_SYNC:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      return inst->header_present ? 1 : 0;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      return 0;
   default:
      unreachable("not reached");
   }
}

bool
src_reg::equals(const src_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           swizzle == r.swizzle &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0);
}

bool
vec4_visitor::opt_vector_float()
{
   bool progress = false;

   int last_reg = -1, last_reg_offset = -1;
   enum register_file last_reg_file = BAD_FILE;

   int remaining_channels = 0;
   uint8_t imm[4];
   int inst_count = 0;
   vec4_instruction *imm_inst[4];

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (last_reg != inst->dst.reg ||
          last_reg_offset != inst->dst.reg_offset ||
          last_reg_file != inst->dst.file) {
         last_reg = inst->dst.reg;
         last_reg_offset = inst->dst.reg_offset;
         last_reg_file = inst->dst.file;
         remaining_channels = WRITEMASK_XYZW;

         inst_count = 0;
      }

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->dst.writemask == WRITEMASK_XYZW ||
          inst->src[0].file != IMM)
         continue;

      int vf = brw_float_to_vf(inst->src[0].fixed_hw_reg.dw1.f);
      if (vf == -1)
         continue;

      if ((inst->dst.writemask & WRITEMASK_X) != 0)
         imm[0] = vf;
      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
         imm[1] = vf;
      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
         imm[2] = vf;
      if ((inst->dst.writemask & WRITEMASK_W) != 0)
         imm[3] = vf;

      imm_inst[inst_count++] = inst;

      remaining_channels &= ~inst->dst.writemask;
      if (remaining_channels == 0) {
         vec4_instruction *mov = MOV(inst->dst, imm);
         mov->dst.type = BRW_REGISTER_TYPE_F;
         mov->dst.writemask = WRITEMASK_XYZW;
         inst->insert_after(block, mov);
         last_reg = -1;

         for (int i = 0; i < inst_count; i++) {
            imm_inst[i]->remove(block);
         }
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
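
/* A sketch of the rewrite the pass above performs, assuming every immediate
 * is representable as a restricted 8-bit vector-float (VF) and using
 * hypothetical register numbers:
 *
 *    mov vgrf6.x, 1.0F
 *    mov vgrf6.y, 0.0F              mov vgrf6.xyzw, [1F, 0F, 0F, 1F]VF
 *    mov vgrf6.z, 0.0F      -->
 *    mov vgrf6.w, 1.0F
 *
 * Once the partial writes cover all four channels, they are replaced by a
 * single full-width MOV of a VF immediate.  If a component can't be encoded,
 * brw_float_to_vf() returns -1 and that MOV is left alone.
 */
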
/* Replaces unused channels of a swizzle with channels that are used.
 *
 * For instance, this pass transforms
 *
 *    mov vgrf4.yz, vgrf5.wxzy
 *
 * into
 *
 *    mov vgrf4.yz, vgrf5.xxzx
 *
 * This eliminates false uses of some channels, letting dead code elimination
 * remove the instructions that wrote them.
 */
bool
vec4_visitor::opt_reduce_swizzle()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
          inst->is_send_from_grf())
         continue;

      int swizzle[4];

      /* Determine which channels of the sources are read. */
      switch (inst->opcode) {
      case VEC4_OPCODE_PACK_BYTES:
         swizzle[0] = 0;
         swizzle[1] = 1;
         swizzle[2] = 2;
         swizzle[3] = 3;
         break;
      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
                            *           but all four of src1.
                            */
         swizzle[0] = 0;
         swizzle[1] = 1;
         swizzle[2] = 2;
         swizzle[3] = 3;
         break;
      case BRW_OPCODE_DP3:
         swizzle[0] = 0;
         swizzle[1] = 1;
         swizzle[2] = 2;
         swizzle[3] = -1;
         break;
      case BRW_OPCODE_DP2:
         swizzle[0] = 0;
         swizzle[1] = 1;
         swizzle[2] = -1;
         swizzle[3] = -1;
         break;
      default:
         swizzle[0] = inst->dst.writemask & WRITEMASK_X ? 0 : -1;
         swizzle[1] = inst->dst.writemask & WRITEMASK_Y ? 1 : -1;
         swizzle[2] = inst->dst.writemask & WRITEMASK_Z ? 2 : -1;
         swizzle[3] = inst->dst.writemask & WRITEMASK_W ? 3 : -1;
         break;
      }

      /* Resolve unread channels (-1) by assigning them the swizzle of the
       * first channel that is used.
       */
      int first_used_channel = 0;
      for (int i = 0; i < 4; i++) {
         if (swizzle[i] != -1) {
            first_used_channel = swizzle[i];
            break;
         }
      }
      for (int i = 0; i < 4; i++) {
         if (swizzle[i] == -1) {
            swizzle[i] = first_used_channel;
         }
      }

      /* Update sources' swizzles. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF &&
             inst->src[i].file != ATTR &&
             inst->src[i].file != UNIFORM)
            continue;

         int swiz[4];
         for (int j = 0; j < 4; j++) {
            swiz[j] = BRW_GET_SWZ(inst->src[i].swizzle, swizzle[j]);
         }

         unsigned new_swizzle = BRW_SWIZZLE4(swiz[0], swiz[1], swiz[2], swiz[3]);
         if (inst->src[i].swizzle != new_swizzle) {
            inst->src[i].swizzle = new_swizzle;
            progress = true;
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over).  Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector.  The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}
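
/* For instance, a mat2 uniform that was laid out as reg 3 with reg_offsets
 * 0 and 1 (reg index 4 skipped) is rewritten so its columns are addressed
 * as regs 3 and 4 with reg_offset 0; after this pass every .reg index
 * names exactly one vec4.
 */
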
void
vec4_visitor::pack_uniform_registers()
{
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program.  We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = this->uniform_vector_size[src];

      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];

         int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src];
         int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src];
         int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src];
         int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src];
         inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
      }
   }
}
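
/* A small example of the packing above: if the program reads uniform vec4s
 * 0 and 2 but not 1, vector 2 slides down into the now-empty slot 1
 * (new_loc[2] == 1, new_chan[2] == 0), its gl_constant_value pointers are
 * moved along with it, every UNIFORM source gets its .reg and swizzle
 * rewritten, and this->uniforms drops to 2.
 */
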
/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons.  One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].fixed_hw_reg)) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
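
/* Typical rewrites performed above, in IR terms:
 *
 *    add dst, src0, 0    -> mov dst, src0
 *    mul dst, src0, 1    -> mov dst, src0
 *    mul dst, src0, -1   -> mov dst, -src0
 *    mul dst, src0, 0    -> mov dst, 0 (immediate typed to match src0)
 *
 * A cmp.ge of -|a| against 0 becomes cmp.z of a against 0 (since -|a| >= 0
 * only holds when a == 0), and an RCP fed directly by a SQRT of the same
 * value is turned into a single RSQ.
 */
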
/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}
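
/* The arithmetic behind the threshold above: 32 registers hold 32 * 8 = 256
 * float components, i.e. 64 vec4 uniforms.  Uniform vec4s past that point
 * are appended to pull_param four components at a time (reusing an existing
 * copy when an array access already pulled them), and the final
 * pack_uniform_registers() call squeezes out the holes they leave behind.
 */
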
/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
{
#define IS_DWORD(reg) \
   (reg.type == BRW_REGISTER_TYPE_UD || \
    reg.type == BRW_REGISTER_TYPE_D)

   /* "When source or destination datatype is 64b or operation is integer DWord
    * multiply, DepCtrl must not be used."
    * May apply to future SoCs as well.
    */
   if (brw->is_cherryview) {
      if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
         return true;
   }
#undef IS_DWORD

   if (brw->gen >= 8) {
      if (inst->opcode == BRW_OPCODE_F32TO16)
         return true;
   }

   /*
    * mlen:
    * In the presence of send messages, totally interrupt dependency
    * control.  They're long enough that the chance of dependency
    * control around them just doesn't matter.
    *
    * predicate:
    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
    * When a sequence of NoDDChk and NoDDClr are used, the last instruction
    * that completes the scoreboard clear must have a non-zero execution mask.
    * This means, if any kind of predication can change the execution mask or
    * channel enable of the last instruction, the optimization must be
    * avoided.  This is to avoid instructions being shot down the pipeline
    * when no writes are required.
    *
    * math:
    * Dependency control does not work well over math instructions.
    * NB: Discovered empirically
    */
   return (inst->mlen || inst->predicate || inst->is_math());
}

/**
 * Sets the dependency control fields on instructions after register
 * allocation and before the generator is run.
 *
 * When you have a sequence of instructions like:
 *
 *   DP4 temp.x vertex uniform[0]
 *   DP4 temp.y vertex uniform[0]
 *   DP4 temp.z vertex uniform[0]
 *   DP4 temp.w vertex uniform[0]
 *
 * The hardware doesn't know that it can actually run the later instructions
 * while the previous ones are in flight, producing stalls.  However, we have
 * manual fields we can set in the instructions that let it do so.
 */
void
vec4_visitor::opt_set_dependency_control()
{
   vec4_instruction *last_grf_write[BRW_MAX_GRF];
   uint8_t grf_channels_written[BRW_MAX_GRF];
   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
   uint8_t mrf_channels_written[BRW_MAX_GRF];

   assert(prog_data->total_grf ||
          !"Must be called after register allocation");

   foreach_block (block, cfg) {
      memset(last_grf_write, 0, sizeof(last_grf_write));
      memset(last_mrf_write, 0, sizeof(last_mrf_write));

      foreach_inst_in_block (vec4_instruction, inst, block) {
         /* If we read from a register that we were doing dependency control
          * on, don't do dependency control across the read.
          */
         for (int i = 0; i < 3; i++) {
            int reg = inst->src[i].reg + inst->src[i].reg_offset;
            if (inst->src[i].file == GRF) {
               last_grf_write[reg] = NULL;
            } else if (inst->src[i].file == HW_REG) {
               memset(last_grf_write, 0, sizeof(last_grf_write));
               break;
            }
            assert(inst->src[i].file != MRF);
         }

         if (is_dep_ctrl_unsafe(inst)) {
            memset(last_grf_write, 0, sizeof(last_grf_write));
            memset(last_mrf_write, 0, sizeof(last_mrf_write));
            continue;
         }

         /* Now, see if we can do dependency control for this instruction
          * against a previous one writing to its destination.
          */
         int reg = inst->dst.reg + inst->dst.reg_offset;
         if (inst->dst.file == GRF) {
            if (last_grf_write[reg] &&
                !(inst->dst.writemask & grf_channels_written[reg])) {
               last_grf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               grf_channels_written[reg] = 0;
            }

            last_grf_write[reg] = inst;
            grf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == MRF) {
            if (last_mrf_write[reg] &&
                !(inst->dst.writemask & mrf_channels_written[reg])) {
               last_mrf_write[reg]->no_dd_clear = true;
               inst->no_dd_check = true;
            } else {
               mrf_channels_written[reg] = 0;
            }

            last_mrf_write[reg] = inst;
            mrf_channels_written[reg] |= inst->dst.writemask;
         } else if (inst->dst.file == HW_REG) {
            if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
               memset(last_grf_write, 0, sizeof(last_grf_write));
            if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
               memset(last_mrf_write, 0, sizeof(last_mrf_write));
         }
      }
   }
}
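
/* Applied to the DP4 sequence from the comment above, the pass marks
 * roughly:
 *
 *    dp4 temp.x vertex uniform[0]   { NoDDClr }
 *    dp4 temp.y vertex uniform[0]   { NoDDClr, NoDDChk }
 *    dp4 temp.z vertex uniform[0]   { NoDDClr, NoDDChk }
 *    dp4 temp.w vertex uniform[0]   { NoDDChk }
 *
 * The writes touch disjoint channels of one register, so every instruction
 * but the last skips the scoreboard clear (no_dd_clear) and every one but
 * the first skips the scoreboard check (no_dd_check).
 */
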
bool
vec4_instruction::can_reswizzle(int dst_writemask,
                                int swizzle,
                                int swizzle_mask)
{
   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   if (mlen > 0)
      return false;

   return true;
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
 */
void
vec4_instruction::reswizzle(int dst_writemask, int swizzle)
{
   int new_writemask = 0;
   int new_swizzle[4] = { 0 };

   /* Dot product instructions write a single result into all channels. */
   if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
       opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2) {
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BAD_FILE || src[i].file == IMM)
            continue;

         /* Destination write mask doesn't correspond to source swizzle for
          * the pack_bytes instruction.
          */
         if (opcode == VEC4_OPCODE_PACK_BYTES)
            continue;

         for (int c = 0; c < 4; c++) {
            new_swizzle[c] = BRW_GET_SWZ(src[i].swizzle, BRW_GET_SWZ(swizzle, c));
         }

         src[i].swizzle = BRW_SWIZZLE4(new_swizzle[0], new_swizzle[1],
                                       new_swizzle[2], new_swizzle[3]);
      }
   }

   for (int c = 0; c < 4; c++) {
      int bit = 1 << BRW_GET_SWZ(swizzle, c);
      /* Skip components of the swizzle not used by the dst. */
      if (!(dst_writemask & (1 << c)))
         continue;
      /* If we were populating this component, then populate the
       * corresponding channel of the new dst.
       */
      if (dst.writemask & bit)
         new_writemask |= (1 << c);
   }
   dst.writemask = new_writemask;
}

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg * 4 + 0] > ip ||
          this->virtual_grf_end[inst->src[0].reg * 4 + 1] > ip ||
          this->virtual_grf_end[inst->src[0].reg * 4 + 2] > ip ||
          this->virtual_grf_end[inst->src[0].reg * 4 + 3] > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      bool chans_needed[4] = {false, false, false, false};
      int chans_remaining = 0;
      int swizzle_mask = 0;
      for (int i = 0; i < 4; i++) {
         int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);

         if (!(inst->dst.writemask & (1 << i)))
            continue;

         swizzle_mask |= (1 << chan);

         if (!chans_needed[chan]) {
            chans_needed[chan] = true;
            chans_remaining++;
         }
      }

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst, block) {
         _scan_inst = scan_inst;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg &&
             scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (brw->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          swizzle_mask)) {
               break;
            }

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate) {
               for (int i = 0; i < 4; i++) {
                  if (scan_inst->dst.writemask & (1 << i) &&
                      chans_needed[i]) {
                     chans_needed[i] = false;
                     chans_remaining--;
                  }
               }
            }

            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (scan_inst->dst.file == inst->dst.file &&
             scan_inst->dst.reg == inst->dst.reg) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (scan_inst->src[i].file == inst->dst.file &&
                   scan_inst->src[i].reg == inst->dst.reg &&
                   scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
                  interfered = true;
               }
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
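
/* The net effect, on a typical pattern (hypothetical register numbers):
 *
 *    mul vgrf5.xyzw, vgrf3, vgrf4        mul vgrf2.xyzw, vgrf3, vgrf4
 *    mov vgrf2.xyzw, vgrf5          -->
 *
 * The MOV disappears and the instruction computing the temporary writes
 * the final destination directly, with reswizzle() fixing up writemask and
 * source swizzles when the MOV was not a plain xyzw copy, and the MOV's
 * saturate flag folded in.
 */
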
/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.  But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* If there's a SEND message loading from a GRF on gen7+, it needs to be
       * contiguous.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}
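
/* For example, a mat4 temporary that was one size-4 vgrf, say vgrf8,
 * becomes four size-1 registers: accesses at reg_offset 0 keep vgrf8,
 * while offsets 1..3 are renumbered to the three freshly allocated
 * (contiguous) registers, letting later passes track each column's live
 * range independently.
 */
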
void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                            inst->opcode != BRW_OPCODE_IF &&
                            inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   default:
      fprintf(file, "???");
      break;
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      default:
         fprintf(file, "???");
         break;
      }

      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
      if (inst->src[i].reg_offset != 0 &&
          inst->src[i].file == GRF &&
          alloc.sizes[inst->src[i].reg] != 1)
         fprintf(file, ".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         fprintf(file, ".");
         for (int c = 0; c < 4; c++) {
            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, "\n");
}


static inline struct brw_reg
attribute_to_hw_reg(int attr, bool interleaved)
{
   if (interleaved)
      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
   else
      return brw_vec8_grf(attr, 0);
}
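
/* Concretely, for the interleaved layout a mapped attribute slot of 4 reads
 * the low vec4 of g2 and a slot of 5 reads the high vec4 of g2, using a
 * <0;4,1> region; in the non-interleaved layout, slot 4 is simply all of
 * g4.
 */
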
/**
 * Replace each register of type ATTR in this->instructions with a reference
 * to a fixed HW register.
 *
 * If interleaved is true, then each attribute takes up half a register, with
 * register N containing attribute 2*N in its first half and attribute 2*N+1
 * in its second half (this corresponds to the payload setup used by geometry
 * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
 * false, then each attribute takes up a whole register, with register N
 * containing attribute N (this corresponds to the payload setup used by
 * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
 */
void
vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                          bool interleaved)
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.type = inst->dst.type;
         reg.dw1.bits.writemask = inst->dst.writemask;

         inst->dst.file = HW_REG;
         inst->dst.fixed_hw_reg = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         reg.dw1.bits.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg = reg;
      }
   }
}

int
vec4_vs_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];
   memset(attribute_map, 0, sizeof(attribute_map));

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
      nr_attributes++;
   }

   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);

   /* The BSpec says we always have to read at least one thing from
    * the VF, and it appears that the hardware wedges otherwise.
    */
   if (nr_attributes == 0)
      nr_attributes = 1;

   prog_data->urb_read_length = (nr_attributes + 1) / 2;

   unsigned vue_entries =
      MAX2(nr_attributes, prog_data->vue_map.num_slots);

   if (brw->gen == 6)
      prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8;
   else
      prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4;

   return payload_reg + nr_attributes;
}
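
/* Worked example of the sizing math above: a VS reading 9 attributes has
 * urb_read_length = (9 + 1) / 2 = 5, since the read length is counted in
 * pairs of vec4 slots.  If the VUE map has 12 slots, vue_entries = 12 and
 * urb_entry_size becomes ALIGN(12, 8) / 8 = 2 on gen6, or
 * ALIGN(12, 4) / 4 = 3 on other generations.
 */
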
int
vec4_visitor::setup_uniforms(int reg)
{
   prog_data->base.dispatch_grf_start_reg = reg;

   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (brw->gen < 6 && this->uniforms == 0) {
      assert(this->uniforms < this->uniform_array_size);
      this->uniform_vector_size[this->uniforms] = 1;

      stage_prog_data->param =
         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static gl_constant_value zero = { 0.0 };
         stage_prog_data->param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      reg += ALIGN(uniforms, 2) / 2;
   }

   stage_prog_data->nr_params = this->uniforms * 4;

   prog_data->base.curb_read_length =
      reg - prog_data->base.dispatch_grf_start_reg;

   return reg;
}

void
vec4_vs_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.  So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

void
vec4_visitor::assign_binding_table_offsets()
{
   assign_common_binding_table_offsets(0);
}

src_reg
vec4_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                0,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(st_base, src_reg(diff));
   emit_shader_time_write(st_written, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(st_reset, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                     src_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, prog, type);

   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));

   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, src_reg(value)));

   vec4_instruction *inst =
      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
   inst->mlen = 2;
}
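
/* The shader-time record written above spans two message registers: the
 * first vec4 carries the buffer offset (shader_time_index *
 * SHADER_TIME_STRIDE) and the second carries the value to accumulate,
 * which is why the SHADER_TIME_ADD send is emitted with mlen = 2.
 */
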
bool
vec4_visitor::run()
{
   sanity_param_count = prog->Parameters->NumParameters;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   assign_binding_table_offsets();

   emit_prolog();

   /* Generate VS IR for main().  (the visitor only descends into
    * functions called "main").
    */
   if (shader) {
      visit_instructions(shader->base.ir);
   } else {
      emit_program_code();
   }
   base_ir = NULL;

   if (key->userclip_active && !prog->UsesClipDistanceOut)
      setup_uniform_clipplane_values();

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move.  There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

   const char *stage_name = stage == MESA_SHADER_GEOMETRY ? "gs" : "vs";

#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
                  stage_name, shader_prog ? shader_prog->Name : 0,     \
                  iteration, pass_num);                                \
                                                                       \
         backend_visitor::dump_instructions(filename);                 \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%04d-00-start",
               stage_name, shader_prog ? shader_prog->Name : 0);

      backend_visitor::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
   } while (progress);

   pass_num = 0;

   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   if (false) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   while (!reg_allocate()) {
      if (failed)
         return false;
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_vs_emit(struct brw_context *brw,
            struct gl_shader_program *prog,
            struct brw_vs_compile *c,
            struct brw_vs_prog_data *prog_data,
            void *mem_ctx,
            unsigned *final_assembly_size)
{
   bool start_busy = false;
   double start_time = 0;
   const unsigned *assembly = NULL;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];

   if (unlikely(INTEL_DEBUG & DEBUG_VS))
      brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);

   if (prog && brw->gen >= 8 && brw->scalar_vs) {
      fs_visitor v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
      if (!v.run_vs()) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      fs_generator g(brw, mem_ctx, (void *) &c->key, &prog_data->base.base,
                     &c->vp->program.Base, v.runtime_check_aads_emit, "VS");
      if (INTEL_DEBUG & DEBUG_VS) {
         char *name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
                                      prog->Label ? prog->Label : "unnamed",
                                      prog->Name);
         g.enable_debug(name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);

      if (assembly)
         prog_data->base.simd8 = true;
      c->base.last_scratch = v.last_scratch;
   }

   if (!assembly) {
      vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
      if (!v.run()) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
                       mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
      assembly = g.generate_assembly(v.cfg, final_assembly_size);
   }

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once) {
         brw_vs_debug_recompile(brw, prog, &c->key);
      }
      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      shader->compiled_once = true;
   }

   return assembly;
}
void
brw_vue_setup_prog_key_for_precompile(struct gl_context *ctx,
                                      struct brw_vue_prog_key *key,
                                      GLuint id, struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   key->program_string_id = id;

   const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
   unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key->tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key->tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }
}

} /* extern "C" */