/* brw_vec4.cpp — revision ae8b066da5862b4cfc510b3a9a0e1273f9f6edd4 */
1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_vec4.h" 25#include "brw_cfg.h" 26#include "brw_vs.h" 27#include "brw_dead_control_flow.h" 28 29extern "C" { 30#include "main/macros.h" 31#include "main/shaderobj.h" 32#include "program/prog_print.h" 33#include "program/prog_parameter.h" 34} 35 36#define MAX_INSTRUCTION (1 << 30) 37 38using namespace brw; 39 40namespace brw { 41 42/** 43 * Common helper for constructing swizzles. When only a subset of 44 * channels of a vec4 are used, we don't want to reference the other 45 * channels, as that will tell optimization passes that those other 46 * channels are used. 
47 */ 48unsigned 49swizzle_for_size(int size) 50{ 51 static const unsigned size_swizzles[4] = { 52 BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X), 53 BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y), 54 BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z), 55 BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W), 56 }; 57 58 assert((size >= 1) && (size <= 4)); 59 return size_swizzles[size - 1]; 60} 61 62void 63src_reg::init() 64{ 65 memset(this, 0, sizeof(*this)); 66 67 this->file = BAD_FILE; 68} 69 70src_reg::src_reg(register_file file, int reg, const glsl_type *type) 71{ 72 init(); 73 74 this->file = file; 75 this->reg = reg; 76 if (type && (type->is_scalar() || type->is_vector() || type->is_matrix())) 77 this->swizzle = swizzle_for_size(type->vector_elements); 78 else 79 this->swizzle = BRW_SWIZZLE_XYZW; 80} 81 82/** Generic unset register constructor. */ 83src_reg::src_reg() 84{ 85 init(); 86} 87 88src_reg::src_reg(float f) 89{ 90 init(); 91 92 this->file = IMM; 93 this->type = BRW_REGISTER_TYPE_F; 94 this->imm.f = f; 95} 96 97src_reg::src_reg(uint32_t u) 98{ 99 init(); 100 101 this->file = IMM; 102 this->type = BRW_REGISTER_TYPE_UD; 103 this->imm.u = u; 104} 105 106src_reg::src_reg(int32_t i) 107{ 108 init(); 109 110 this->file = IMM; 111 this->type = BRW_REGISTER_TYPE_D; 112 this->imm.i = i; 113} 114 115src_reg::src_reg(struct brw_reg reg) 116{ 117 init(); 118 119 this->file = HW_REG; 120 this->fixed_hw_reg = reg; 121} 122 123src_reg::src_reg(dst_reg reg) 124{ 125 init(); 126 127 this->file = reg.file; 128 this->reg = reg.reg; 129 this->reg_offset = reg.reg_offset; 130 this->type = reg.type; 131 this->reladdr = reg.reladdr; 132 this->fixed_hw_reg = reg.fixed_hw_reg; 133 134 int swizzles[4]; 135 int next_chan = 0; 136 int last = 0; 137 138 for (int i = 0; i < 4; i++) { 139 if (!(reg.writemask & (1 << i))) 140 continue; 141 142 swizzles[next_chan++] = last = i; 143 } 144 145 for (; next_chan < 4; next_chan++) { 146 swizzles[next_chan] = 
last; 147 } 148 149 this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1], 150 swizzles[2], swizzles[3]); 151} 152 153void 154dst_reg::init() 155{ 156 memset(this, 0, sizeof(*this)); 157 this->file = BAD_FILE; 158 this->writemask = WRITEMASK_XYZW; 159} 160 161dst_reg::dst_reg() 162{ 163 init(); 164} 165 166dst_reg::dst_reg(register_file file, int reg) 167{ 168 init(); 169 170 this->file = file; 171 this->reg = reg; 172} 173 174dst_reg::dst_reg(register_file file, int reg, const glsl_type *type, 175 int writemask) 176{ 177 init(); 178 179 this->file = file; 180 this->reg = reg; 181 this->type = brw_type_for_base_type(type); 182 this->writemask = writemask; 183} 184 185dst_reg::dst_reg(struct brw_reg reg) 186{ 187 init(); 188 189 this->file = HW_REG; 190 this->fixed_hw_reg = reg; 191} 192 193dst_reg::dst_reg(src_reg reg) 194{ 195 init(); 196 197 this->file = reg.file; 198 this->reg = reg.reg; 199 this->reg_offset = reg.reg_offset; 200 this->type = reg.type; 201 /* How should we do writemasking when converting from a src_reg? It seems 202 * pretty obvious that for src.xxxx the caller wants to write to src.x, but 203 * what about for src.wx? Just special-case src.xxxx for now. 204 */ 205 if (reg.swizzle == BRW_SWIZZLE_XXXX) 206 this->writemask = WRITEMASK_X; 207 else 208 this->writemask = WRITEMASK_XYZW; 209 this->reladdr = reg.reladdr; 210 this->fixed_hw_reg = reg.fixed_hw_reg; 211} 212 213bool 214vec4_instruction::is_send_from_grf() 215{ 216 switch (opcode) { 217 case SHADER_OPCODE_SHADER_TIME_ADD: 218 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: 219 return true; 220 default: 221 return false; 222 } 223} 224 225bool 226vec4_visitor::can_do_source_mods(vec4_instruction *inst) 227{ 228 if (brw->gen == 6 && inst->is_math()) 229 return false; 230 231 if (inst->is_send_from_grf()) 232 return false; 233 234 if (!inst->can_do_source_mods()) 235 return false; 236 237 return true; 238} 239 240/** 241 * Returns how many MRFs an opcode will write over. 
242 * 243 * Note that this is not the 0 or 1 implied writes in an actual gen 244 * instruction -- the generate_* functions generate additional MOVs 245 * for setup. 246 */ 247int 248vec4_visitor::implied_mrf_writes(vec4_instruction *inst) 249{ 250 if (inst->mlen == 0) 251 return 0; 252 253 switch (inst->opcode) { 254 case SHADER_OPCODE_RCP: 255 case SHADER_OPCODE_RSQ: 256 case SHADER_OPCODE_SQRT: 257 case SHADER_OPCODE_EXP2: 258 case SHADER_OPCODE_LOG2: 259 case SHADER_OPCODE_SIN: 260 case SHADER_OPCODE_COS: 261 return 1; 262 case SHADER_OPCODE_INT_QUOTIENT: 263 case SHADER_OPCODE_INT_REMAINDER: 264 case SHADER_OPCODE_POW: 265 return 2; 266 case VS_OPCODE_URB_WRITE: 267 return 1; 268 case VS_OPCODE_PULL_CONSTANT_LOAD: 269 return 2; 270 case SHADER_OPCODE_GEN4_SCRATCH_READ: 271 return 2; 272 case SHADER_OPCODE_GEN4_SCRATCH_WRITE: 273 return 3; 274 case GS_OPCODE_URB_WRITE: 275 case GS_OPCODE_THREAD_END: 276 return 0; 277 case SHADER_OPCODE_SHADER_TIME_ADD: 278 return 0; 279 case SHADER_OPCODE_TEX: 280 case SHADER_OPCODE_TXL: 281 case SHADER_OPCODE_TXD: 282 case SHADER_OPCODE_TXF: 283 case SHADER_OPCODE_TXF_CMS: 284 case SHADER_OPCODE_TXF_MCS: 285 case SHADER_OPCODE_TXS: 286 case SHADER_OPCODE_TG4: 287 case SHADER_OPCODE_TG4_OFFSET: 288 return inst->header_present ? 
1 : 0; 289 case SHADER_OPCODE_UNTYPED_ATOMIC: 290 case SHADER_OPCODE_UNTYPED_SURFACE_READ: 291 return 0; 292 default: 293 assert(!"not reached"); 294 return inst->mlen; 295 } 296} 297 298bool 299src_reg::equals(src_reg *r) 300{ 301 return (file == r->file && 302 reg == r->reg && 303 reg_offset == r->reg_offset && 304 type == r->type && 305 negate == r->negate && 306 abs == r->abs && 307 swizzle == r->swizzle && 308 !reladdr && !r->reladdr && 309 memcmp(&fixed_hw_reg, &r->fixed_hw_reg, 310 sizeof(fixed_hw_reg)) == 0 && 311 imm.u == r->imm.u); 312} 313 314/** 315 * Must be called after calculate_live_intervales() to remove unused 316 * writes to registers -- register allocation will fail otherwise 317 * because something deffed but not used won't be considered to 318 * interfere with other regs. 319 */ 320bool 321vec4_visitor::dead_code_eliminate() 322{ 323 bool progress = false; 324 int pc = 0; 325 326 calculate_live_intervals(); 327 328 foreach_list_safe(node, &this->instructions) { 329 vec4_instruction *inst = (vec4_instruction *)node; 330 331 if (inst->dst.file == GRF && !inst->has_side_effects()) { 332 assert(this->virtual_grf_end[inst->dst.reg] >= pc); 333 if (this->virtual_grf_end[inst->dst.reg] == pc) { 334 /* Don't dead code eliminate instructions that write to the 335 * accumulator as a side-effect. Instead just set the destination 336 * to the null register to free it. 
337 */ 338 switch (inst->opcode) { 339 case BRW_OPCODE_ADDC: 340 case BRW_OPCODE_SUBB: 341 case BRW_OPCODE_MACH: 342 inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type)); 343 break; 344 default: 345 inst->remove(); 346 break; 347 } 348 progress = true; 349 } 350 } 351 352 pc++; 353 } 354 355 if (progress) 356 invalidate_live_intervals(); 357 358 return progress; 359} 360 361void 362vec4_visitor::split_uniform_registers() 363{ 364 /* Prior to this, uniforms have been in an array sized according to 365 * the number of vector uniforms present, sparsely filled (so an 366 * aggregate results in reg indices being skipped over). Now we're 367 * going to cut those aggregates up so each .reg index is one 368 * vector. The goal is to make elimination of unused uniform 369 * components easier later. 370 */ 371 foreach_list(node, &this->instructions) { 372 vec4_instruction *inst = (vec4_instruction *)node; 373 374 for (int i = 0 ; i < 3; i++) { 375 if (inst->src[i].file != UNIFORM) 376 continue; 377 378 assert(!inst->src[i].reladdr); 379 380 inst->src[i].reg += inst->src[i].reg_offset; 381 inst->src[i].reg_offset = 0; 382 } 383 } 384 385 /* Update that everything is now vector-sized. */ 386 for (int i = 0; i < this->uniforms; i++) { 387 this->uniform_size[i] = 1; 388 } 389} 390 391void 392vec4_visitor::pack_uniform_registers() 393{ 394 bool uniform_used[this->uniforms]; 395 int new_loc[this->uniforms]; 396 int new_chan[this->uniforms]; 397 398 memset(uniform_used, 0, sizeof(uniform_used)); 399 memset(new_loc, 0, sizeof(new_loc)); 400 memset(new_chan, 0, sizeof(new_chan)); 401 402 /* Find which uniform vectors are actually used by the program. We 403 * expect unused vector elements when we've moved array access out 404 * to pull constants, and from some GLSL code generators like wine. 
405 */ 406 foreach_list(node, &this->instructions) { 407 vec4_instruction *inst = (vec4_instruction *)node; 408 409 for (int i = 0 ; i < 3; i++) { 410 if (inst->src[i].file != UNIFORM) 411 continue; 412 413 uniform_used[inst->src[i].reg] = true; 414 } 415 } 416 417 int new_uniform_count = 0; 418 419 /* Now, figure out a packing of the live uniform vectors into our 420 * push constants. 421 */ 422 for (int src = 0; src < uniforms; src++) { 423 int size = this->uniform_vector_size[src]; 424 425 if (!uniform_used[src]) { 426 this->uniform_vector_size[src] = 0; 427 continue; 428 } 429 430 int dst; 431 /* Find the lowest place we can slot this uniform in. */ 432 for (dst = 0; dst < src; dst++) { 433 if (this->uniform_vector_size[dst] + size <= 4) 434 break; 435 } 436 437 if (src == dst) { 438 new_loc[src] = dst; 439 new_chan[src] = 0; 440 } else { 441 new_loc[src] = dst; 442 new_chan[src] = this->uniform_vector_size[dst]; 443 444 /* Move the references to the data */ 445 for (int j = 0; j < size; j++) { 446 stage_prog_data->param[dst * 4 + new_chan[src] + j] = 447 stage_prog_data->param[src * 4 + j]; 448 } 449 450 this->uniform_vector_size[dst] += size; 451 this->uniform_vector_size[src] = 0; 452 } 453 454 new_uniform_count = MAX2(new_uniform_count, dst + 1); 455 } 456 457 this->uniforms = new_uniform_count; 458 459 /* Now, update the instructions for our repacked uniforms. 
*/ 460 foreach_list(node, &this->instructions) { 461 vec4_instruction *inst = (vec4_instruction *)node; 462 463 for (int i = 0 ; i < 3; i++) { 464 int src = inst->src[i].reg; 465 466 if (inst->src[i].file != UNIFORM) 467 continue; 468 469 inst->src[i].reg = new_loc[src]; 470 471 int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src]; 472 int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src]; 473 int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src]; 474 int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src]; 475 inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw); 476 } 477 } 478} 479 480bool 481src_reg::is_zero() const 482{ 483 if (file != IMM) 484 return false; 485 486 if (type == BRW_REGISTER_TYPE_F) { 487 return imm.f == 0.0; 488 } else { 489 return imm.i == 0; 490 } 491} 492 493bool 494src_reg::is_one() const 495{ 496 if (file != IMM) 497 return false; 498 499 if (type == BRW_REGISTER_TYPE_F) { 500 return imm.f == 1.0; 501 } else { 502 return imm.i == 1; 503 } 504} 505 506/** 507 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a). 508 * 509 * While GLSL IR also performs this optimization, we end up with it in 510 * our instruction stream for a couple of reasons. One is that we 511 * sometimes generate silly instructions, for example in array access 512 * where we'll generate "ADD offset, index, base" even if base is 0. 513 * The other is that GLSL IR's constant propagation doesn't track the 514 * components of aggregates, so some VS patterns (initialize matrix to 515 * 0, accumulate in vertex blending factors) end up breaking down to 516 * instructions involving 0. 
517 */ 518bool 519vec4_visitor::opt_algebraic() 520{ 521 bool progress = false; 522 523 foreach_list(node, &this->instructions) { 524 vec4_instruction *inst = (vec4_instruction *)node; 525 526 switch (inst->opcode) { 527 case BRW_OPCODE_ADD: 528 if (inst->src[1].is_zero()) { 529 inst->opcode = BRW_OPCODE_MOV; 530 inst->src[1] = src_reg(); 531 progress = true; 532 } 533 break; 534 535 case BRW_OPCODE_MUL: 536 if (inst->src[1].is_zero()) { 537 inst->opcode = BRW_OPCODE_MOV; 538 switch (inst->src[0].type) { 539 case BRW_REGISTER_TYPE_F: 540 inst->src[0] = src_reg(0.0f); 541 break; 542 case BRW_REGISTER_TYPE_D: 543 inst->src[0] = src_reg(0); 544 break; 545 case BRW_REGISTER_TYPE_UD: 546 inst->src[0] = src_reg(0u); 547 break; 548 default: 549 assert(!"not reached"); 550 inst->src[0] = src_reg(0.0f); 551 break; 552 } 553 inst->src[1] = src_reg(); 554 progress = true; 555 } else if (inst->src[1].is_one()) { 556 inst->opcode = BRW_OPCODE_MOV; 557 inst->src[1] = src_reg(); 558 progress = true; 559 } 560 break; 561 default: 562 break; 563 } 564 } 565 566 if (progress) 567 invalidate_live_intervals(); 568 569 return progress; 570} 571 572/** 573 * Only a limited number of hardware registers may be used for push 574 * constants, so this turns access to the overflowed constants into 575 * pull constants. 576 */ 577void 578vec4_visitor::move_push_constants_to_pull_constants() 579{ 580 int pull_constant_loc[this->uniforms]; 581 582 /* Only allow 32 registers (256 uniform components) as push constants, 583 * which is the limit on gen6. 584 */ 585 int max_uniform_components = 32 * 8; 586 if (this->uniforms * 4 <= max_uniform_components) 587 return; 588 589 /* Make some sort of choice as to which uniforms get sent to pull 590 * constants. We could potentially do something clever here like 591 * look for the most infrequently used uniform vec4s, but leave 592 * that for later. 
593 */ 594 for (int i = 0; i < this->uniforms * 4; i += 4) { 595 pull_constant_loc[i / 4] = -1; 596 597 if (i >= max_uniform_components) { 598 const float **values = &stage_prog_data->param[i]; 599 600 /* Try to find an existing copy of this uniform in the pull 601 * constants if it was part of an array access already. 602 */ 603 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) { 604 int matches; 605 606 for (matches = 0; matches < 4; matches++) { 607 if (stage_prog_data->pull_param[j + matches] != values[matches]) 608 break; 609 } 610 611 if (matches == 4) { 612 pull_constant_loc[i / 4] = j / 4; 613 break; 614 } 615 } 616 617 if (pull_constant_loc[i / 4] == -1) { 618 assert(stage_prog_data->nr_pull_params % 4 == 0); 619 pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4; 620 621 for (int j = 0; j < 4; j++) { 622 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] = 623 values[j]; 624 } 625 } 626 } 627 } 628 629 /* Now actually rewrite usage of the things we've moved to pull 630 * constants. 631 */ 632 foreach_list_safe(node, &this->instructions) { 633 vec4_instruction *inst = (vec4_instruction *)node; 634 635 for (int i = 0 ; i < 3; i++) { 636 if (inst->src[i].file != UNIFORM || 637 pull_constant_loc[inst->src[i].reg] == -1) 638 continue; 639 640 int uniform = inst->src[i].reg; 641 642 dst_reg temp = dst_reg(this, glsl_type::vec4_type); 643 644 emit_pull_constant_load(inst, temp, inst->src[i], 645 pull_constant_loc[uniform]); 646 647 inst->src[i].file = temp.file; 648 inst->src[i].reg = temp.reg; 649 inst->src[i].reg_offset = temp.reg_offset; 650 inst->src[i].reladdr = NULL; 651 } 652 } 653 654 /* Repack push constants to remove the now-unused ones. */ 655 pack_uniform_registers(); 656} 657 658/** 659 * Sets the dependency control fields on instructions after register 660 * allocation and before the generator is run. 
661 * 662 * When you have a sequence of instructions like: 663 * 664 * DP4 temp.x vertex uniform[0] 665 * DP4 temp.y vertex uniform[0] 666 * DP4 temp.z vertex uniform[0] 667 * DP4 temp.w vertex uniform[0] 668 * 669 * The hardware doesn't know that it can actually run the later instructions 670 * while the previous ones are in flight, producing stalls. However, we have 671 * manual fields we can set in the instructions that let it do so. 672 */ 673void 674vec4_visitor::opt_set_dependency_control() 675{ 676 vec4_instruction *last_grf_write[BRW_MAX_GRF]; 677 uint8_t grf_channels_written[BRW_MAX_GRF]; 678 vec4_instruction *last_mrf_write[BRW_MAX_GRF]; 679 uint8_t mrf_channels_written[BRW_MAX_GRF]; 680 681 cfg_t cfg(&instructions); 682 683 assert(prog_data->total_grf || 684 !"Must be called after register allocation"); 685 686 for (int i = 0; i < cfg.num_blocks; i++) { 687 bblock_t *bblock = cfg.blocks[i]; 688 vec4_instruction *inst; 689 690 memset(last_grf_write, 0, sizeof(last_grf_write)); 691 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 692 693 for (inst = (vec4_instruction *)bblock->start; 694 inst != (vec4_instruction *)bblock->end->next; 695 inst = (vec4_instruction *)inst->next) { 696 /* If we read from a register that we were doing dependency control 697 * on, don't do dependency control across the read. 698 */ 699 for (int i = 0; i < 3; i++) { 700 int reg = inst->src[i].reg + inst->src[i].reg_offset; 701 if (inst->src[i].file == GRF) { 702 last_grf_write[reg] = NULL; 703 } else if (inst->src[i].file == HW_REG) { 704 memset(last_grf_write, 0, sizeof(last_grf_write)); 705 break; 706 } 707 assert(inst->src[i].file != MRF); 708 } 709 710 /* In the presence of send messages, totally interrupt dependency 711 * control. They're long enough that the chance of dependency 712 * control around them just doesn't matter. 
713 */ 714 if (inst->mlen) { 715 memset(last_grf_write, 0, sizeof(last_grf_write)); 716 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 717 continue; 718 } 719 720 /* It looks like setting dependency control on a predicated 721 * instruction hangs the GPU. 722 */ 723 if (inst->predicate) { 724 memset(last_grf_write, 0, sizeof(last_grf_write)); 725 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 726 continue; 727 } 728 729 /* Now, see if we can do dependency control for this instruction 730 * against a previous one writing to its destination. 731 */ 732 int reg = inst->dst.reg + inst->dst.reg_offset; 733 if (inst->dst.file == GRF) { 734 if (last_grf_write[reg] && 735 !(inst->dst.writemask & grf_channels_written[reg])) { 736 last_grf_write[reg]->no_dd_clear = true; 737 inst->no_dd_check = true; 738 } else { 739 grf_channels_written[reg] = 0; 740 } 741 742 last_grf_write[reg] = inst; 743 grf_channels_written[reg] |= inst->dst.writemask; 744 } else if (inst->dst.file == MRF) { 745 if (last_mrf_write[reg] && 746 !(inst->dst.writemask & mrf_channels_written[reg])) { 747 last_mrf_write[reg]->no_dd_clear = true; 748 inst->no_dd_check = true; 749 } else { 750 mrf_channels_written[reg] = 0; 751 } 752 753 last_mrf_write[reg] = inst; 754 mrf_channels_written[reg] |= inst->dst.writemask; 755 } else if (inst->dst.reg == HW_REG) { 756 if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) 757 memset(last_grf_write, 0, sizeof(last_grf_write)); 758 if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE) 759 memset(last_mrf_write, 0, sizeof(last_mrf_write)); 760 } 761 } 762 } 763} 764 765bool 766vec4_instruction::can_reswizzle_dst(int dst_writemask, 767 int swizzle, 768 int swizzle_mask) 769{ 770 /* If this instruction sets anything not referenced by swizzle, then we'd 771 * totally break it when we reswizzle. 
772 */ 773 if (dst.writemask & ~swizzle_mask) 774 return false; 775 776 switch (opcode) { 777 case BRW_OPCODE_DP4: 778 case BRW_OPCODE_DP3: 779 case BRW_OPCODE_DP2: 780 return true; 781 default: 782 /* Check if there happens to be no reswizzling required. */ 783 for (int c = 0; c < 4; c++) { 784 int bit = 1 << BRW_GET_SWZ(swizzle, c); 785 /* Skip components of the swizzle not used by the dst. */ 786 if (!(dst_writemask & (1 << c))) 787 continue; 788 789 /* We don't do the reswizzling yet, so just sanity check that we 790 * don't have to. 791 */ 792 if (bit != (1 << c)) 793 return false; 794 } 795 return true; 796 } 797} 798 799/** 800 * For any channels in the swizzle's source that were populated by this 801 * instruction, rewrite the instruction to put the appropriate result directly 802 * in those channels. 803 * 804 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x 805 */ 806void 807vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle) 808{ 809 int new_writemask = 0; 810 811 switch (opcode) { 812 case BRW_OPCODE_DP4: 813 case BRW_OPCODE_DP3: 814 case BRW_OPCODE_DP2: 815 for (int c = 0; c < 4; c++) { 816 int bit = 1 << BRW_GET_SWZ(swizzle, c); 817 /* Skip components of the swizzle not used by the dst. */ 818 if (!(dst_writemask & (1 << c))) 819 continue; 820 /* If we were populating this component, then populate the 821 * corresponding channel of the new dst. 822 */ 823 if (dst.writemask & bit) 824 new_writemask |= (1 << c); 825 } 826 dst.writemask = new_writemask; 827 break; 828 default: 829 for (int c = 0; c < 4; c++) { 830 /* Skip components of the swizzle not used by the dst. */ 831 if (!(dst_writemask & (1 << c))) 832 continue; 833 834 /* We don't do the reswizzling yet, so just sanity check that we 835 * don't have to. 
836 */ 837 assert((1 << BRW_GET_SWZ(swizzle, c)) == (1 << c)); 838 } 839 break; 840 } 841} 842 843/* 844 * Tries to reduce extra MOV instructions by taking temporary GRFs that get 845 * just written and then MOVed into another reg and making the original write 846 * of the GRF write directly to the final destination instead. 847 */ 848bool 849vec4_visitor::opt_register_coalesce() 850{ 851 bool progress = false; 852 int next_ip = 0; 853 854 calculate_live_intervals(); 855 856 foreach_list_safe(node, &this->instructions) { 857 vec4_instruction *inst = (vec4_instruction *)node; 858 859 int ip = next_ip; 860 next_ip++; 861 862 if (inst->opcode != BRW_OPCODE_MOV || 863 (inst->dst.file != GRF && inst->dst.file != MRF) || 864 inst->predicate || 865 inst->src[0].file != GRF || 866 inst->dst.type != inst->src[0].type || 867 inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr) 868 continue; 869 870 bool to_mrf = (inst->dst.file == MRF); 871 872 /* Can't coalesce this GRF if someone else was going to 873 * read it later. 874 */ 875 if (this->virtual_grf_end[inst->src[0].reg] > ip) 876 continue; 877 878 /* We need to check interference with the final destination between this 879 * instruction and the earliest instruction involved in writing the GRF 880 * we're eliminating. To do that, keep track of which of our source 881 * channels we've seen initialized. 882 */ 883 bool chans_needed[4] = {false, false, false, false}; 884 int chans_remaining = 0; 885 int swizzle_mask = 0; 886 for (int i = 0; i < 4; i++) { 887 int chan = BRW_GET_SWZ(inst->src[0].swizzle, i); 888 889 if (!(inst->dst.writemask & (1 << i))) 890 continue; 891 892 swizzle_mask |= (1 << chan); 893 894 if (!chans_needed[chan]) { 895 chans_needed[chan] = true; 896 chans_remaining++; 897 } 898 } 899 900 /* Now walk up the instruction stream trying to see if we can rewrite 901 * everything writing to the temporary to write into the destination 902 * instead. 
903 */ 904 vec4_instruction *scan_inst; 905 for (scan_inst = (vec4_instruction *)inst->prev; 906 scan_inst->prev != NULL; 907 scan_inst = (vec4_instruction *)scan_inst->prev) { 908 if (scan_inst->dst.file == GRF && 909 scan_inst->dst.reg == inst->src[0].reg && 910 scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 911 /* Found something writing to the reg we want to coalesce away. */ 912 if (to_mrf) { 913 /* SEND instructions can't have MRF as a destination. */ 914 if (scan_inst->mlen) 915 break; 916 917 if (brw->gen == 6) { 918 /* gen6 math instructions must have the destination be 919 * GRF, so no compute-to-MRF for them. 920 */ 921 if (scan_inst->is_math()) { 922 break; 923 } 924 } 925 } 926 927 /* If we can't handle the swizzle, bail. */ 928 if (!scan_inst->can_reswizzle_dst(inst->dst.writemask, 929 inst->src[0].swizzle, 930 swizzle_mask)) { 931 break; 932 } 933 934 /* Mark which channels we found unconditional writes for. */ 935 if (!scan_inst->predicate) { 936 for (int i = 0; i < 4; i++) { 937 if (scan_inst->dst.writemask & (1 << i) && 938 chans_needed[i]) { 939 chans_needed[i] = false; 940 chans_remaining--; 941 } 942 } 943 } 944 945 if (chans_remaining == 0) 946 break; 947 } 948 949 /* We don't handle flow control here. Most computation of values 950 * that could be coalesced happens just before their use. 951 */ 952 if (scan_inst->opcode == BRW_OPCODE_DO || 953 scan_inst->opcode == BRW_OPCODE_WHILE || 954 scan_inst->opcode == BRW_OPCODE_ELSE || 955 scan_inst->opcode == BRW_OPCODE_ENDIF) { 956 break; 957 } 958 959 /* You can't read from an MRF, so if someone else reads our MRF's 960 * source GRF that we wanted to rewrite, that stops us. If it's a 961 * GRF we're trying to coalesce to, we don't actually handle 962 * rewriting sources so bail in that case as well. 
963 */ 964 bool interfered = false; 965 for (int i = 0; i < 3; i++) { 966 if (scan_inst->src[i].file == GRF && 967 scan_inst->src[i].reg == inst->src[0].reg && 968 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 969 interfered = true; 970 } 971 } 972 if (interfered) 973 break; 974 975 /* If somebody else writes our destination here, we can't coalesce 976 * before that. 977 */ 978 if (scan_inst->dst.file == inst->dst.file && 979 scan_inst->dst.reg == inst->dst.reg) { 980 break; 981 } 982 983 /* Check for reads of the register we're trying to coalesce into. We 984 * can't go rewriting instructions above that to put some other value 985 * in the register instead. 986 */ 987 if (to_mrf && scan_inst->mlen > 0) { 988 if (inst->dst.reg >= scan_inst->base_mrf && 989 inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) { 990 break; 991 } 992 } else { 993 for (int i = 0; i < 3; i++) { 994 if (scan_inst->src[i].file == inst->dst.file && 995 scan_inst->src[i].reg == inst->dst.reg && 996 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 997 interfered = true; 998 } 999 } 1000 if (interfered) 1001 break; 1002 } 1003 } 1004 1005 if (chans_remaining == 0) { 1006 /* If we've made it here, we have an MOV we want to coalesce out, and 1007 * a scan_inst pointing to the earliest instruction involved in 1008 * computing the value. Now go rewrite the instruction stream 1009 * between the two. 
1010 */ 1011 1012 while (scan_inst != inst) { 1013 if (scan_inst->dst.file == GRF && 1014 scan_inst->dst.reg == inst->src[0].reg && 1015 scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1016 scan_inst->reswizzle_dst(inst->dst.writemask, 1017 inst->src[0].swizzle); 1018 scan_inst->dst.file = inst->dst.file; 1019 scan_inst->dst.reg = inst->dst.reg; 1020 scan_inst->dst.reg_offset = inst->dst.reg_offset; 1021 scan_inst->saturate |= inst->saturate; 1022 } 1023 scan_inst = (vec4_instruction *)scan_inst->next; 1024 } 1025 inst->remove(); 1026 progress = true; 1027 } 1028 } 1029 1030 if (progress) 1031 invalidate_live_intervals(); 1032 1033 return progress; 1034} 1035 1036/** 1037 * Splits virtual GRFs requesting more than one contiguous physical register. 1038 * 1039 * We initially create large virtual GRFs for temporary structures, arrays, 1040 * and matrices, so that the dereference visitor functions can add reg_offsets 1041 * to work their way down to the actual member being accessed. But when it 1042 * comes to optimization, we'd like to treat each register as individual 1043 * storage if possible. 1044 * 1045 * So far, the only thing that might prevent splitting is a send message from 1046 * a GRF on IVB. 1047 */ 1048void 1049vec4_visitor::split_virtual_grfs() 1050{ 1051 int num_vars = this->virtual_grf_count; 1052 int new_virtual_grf[num_vars]; 1053 bool split_grf[num_vars]; 1054 1055 memset(new_virtual_grf, 0, sizeof(new_virtual_grf)); 1056 1057 /* Try to split anything > 0 sized. */ 1058 for (int i = 0; i < num_vars; i++) { 1059 split_grf[i] = this->virtual_grf_sizes[i] != 1; 1060 } 1061 1062 /* Check that the instructions are compatible with the registers we're trying 1063 * to split. 1064 */ 1065 foreach_list(node, &this->instructions) { 1066 vec4_instruction *inst = (vec4_instruction *)node; 1067 1068 /* If there's a SEND message loading from a GRF on gen7+, it needs to be 1069 * contiguous. 
1070 */ 1071 if (inst->is_send_from_grf()) { 1072 for (int i = 0; i < 3; i++) { 1073 if (inst->src[i].file == GRF) { 1074 split_grf[inst->src[i].reg] = false; 1075 } 1076 } 1077 } 1078 } 1079 1080 /* Allocate new space for split regs. Note that the virtual 1081 * numbers will be contiguous. 1082 */ 1083 for (int i = 0; i < num_vars; i++) { 1084 if (!split_grf[i]) 1085 continue; 1086 1087 new_virtual_grf[i] = virtual_grf_alloc(1); 1088 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 1089 int reg = virtual_grf_alloc(1); 1090 assert(reg == new_virtual_grf[i] + j - 1); 1091 (void) reg; 1092 } 1093 this->virtual_grf_sizes[i] = 1; 1094 } 1095 1096 foreach_list(node, &this->instructions) { 1097 vec4_instruction *inst = (vec4_instruction *)node; 1098 1099 if (inst->dst.file == GRF && split_grf[inst->dst.reg] && 1100 inst->dst.reg_offset != 0) { 1101 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 1102 inst->dst.reg_offset - 1); 1103 inst->dst.reg_offset = 0; 1104 } 1105 for (int i = 0; i < 3; i++) { 1106 if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] && 1107 inst->src[i].reg_offset != 0) { 1108 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 1109 inst->src[i].reg_offset - 1); 1110 inst->src[i].reg_offset = 0; 1111 } 1112 } 1113 } 1114 invalidate_live_intervals(); 1115} 1116 1117void 1118vec4_visitor::dump_instruction(backend_instruction *be_inst) 1119{ 1120 vec4_instruction *inst = (vec4_instruction *)be_inst; 1121 1122 printf("%s", brw_instruction_name(inst->opcode)); 1123 if (inst->conditional_mod) { 1124 printf("%s", conditional_modifier[inst->conditional_mod]); 1125 } 1126 printf(" "); 1127 1128 switch (inst->dst.file) { 1129 case GRF: 1130 printf("vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset); 1131 break; 1132 case MRF: 1133 printf("m%d", inst->dst.reg); 1134 break; 1135 case HW_REG: 1136 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) { 1137 switch (inst->dst.fixed_hw_reg.nr) { 1138 case BRW_ARF_NULL: 1139 
            printf("null");
            break;
         case BRW_ARF_ADDRESS:
            printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            printf("acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                   inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                   inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         printf("+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   default:
      printf("???");
      break;
   }

   /* Print the destination writemask as a ".xyzw" suffix, omitting it
    * entirely for the common full-write case.
    */
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      printf(".");
      if (inst->dst.writemask & 1)
         printf("x");
      if (inst->dst.writemask & 2)
         printf("y");
      if (inst->dst.writemask & 4)
         printf("z");
      if (inst->dst.writemask & 8)
         printf("w");
   }
   printf(":%s, ", brw_reg_type_letters(inst->dst.type));

   /* Dump up to three sources; a BAD_FILE source terminates the list. */
   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         printf("attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         break;
      case IMM:
         /* Immediates are printed with a type-letter suffix (F/D/U). */
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            printf("%fF", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            printf("%dD", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            printf("%uU", inst->src[i].imm.u);
            break;
         default:
            printf("???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            printf("-");
         if (inst->src[i].fixed_hw_reg.abs)
            printf("|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               printf("null");
               break;
            case BRW_ARF_ADDRESS:
               printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                      inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                      inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            printf("+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            printf("|");
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }

      /* NOTE(review): virtual_grf_sizes is indexed here even when
       * src[i].file is not GRF (e.g. IMM or HW_REG, where .reg is
       * meaningless) — looks like this should be guarded with
       * file == GRF; confirm against later Mesa revisions.
       */
      if (virtual_grf_sizes[inst->src[i].reg] != 1)
         printf(".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         printf(".");
         for (int c = 0; c < 4; c++) {
            printf("%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      /* Closes the "|" opened above when the source has abs. */
      if (inst->src[i].abs)
         printf("|");

      if (inst->src[i].file != IMM) {
         printf(":%s", reg_encoding[inst->src[i].type]);
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         printf(", ");
   }

   printf("\n");
}


/* Map a payload attribute slot index to the fixed hardware GRF holding it.
 * In interleaved mode two attributes share a GRF (4 components each); in
 * non-interleaved mode each attribute owns a full vec8 GRF.
 */
static inline struct brw_reg
attribute_to_hw_reg(int attr, bool interleaved)
{
   if (interleaved)
      return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
   else
      return brw_vec8_grf(attr, 0);
}


/**
 * Replace each register of type ATTR in this->instructions with a reference
 * to a fixed HW register.
1293 * 1294 * If interleaved is true, then each attribute takes up half a register, with 1295 * register N containing attribute 2*N in its first half and attribute 2*N+1 1296 * in its second half (this corresponds to the payload setup used by geometry 1297 * shaders in "single" or "dual instanced" dispatch mode). If interleaved is 1298 * false, then each attribute takes up a whole register, with register N 1299 * containing attribute N (this corresponds to the payload setup used by 1300 * vertex shaders, and by geometry shaders in "dual object" dispatch mode). 1301 */ 1302void 1303vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map, 1304 bool interleaved) 1305{ 1306 foreach_list(node, &this->instructions) { 1307 vec4_instruction *inst = (vec4_instruction *)node; 1308 1309 /* We have to support ATTR as a destination for GL_FIXED fixup. */ 1310 if (inst->dst.file == ATTR) { 1311 int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset]; 1312 1313 /* All attributes used in the shader need to have been assigned a 1314 * hardware register by the caller 1315 */ 1316 assert(grf != 0); 1317 1318 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved); 1319 reg.type = inst->dst.type; 1320 reg.dw1.bits.writemask = inst->dst.writemask; 1321 1322 inst->dst.file = HW_REG; 1323 inst->dst.fixed_hw_reg = reg; 1324 } 1325 1326 for (int i = 0; i < 3; i++) { 1327 if (inst->src[i].file != ATTR) 1328 continue; 1329 1330 int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset]; 1331 1332 /* All attributes used in the shader need to have been assigned a 1333 * hardware register by the caller 1334 */ 1335 assert(grf != 0); 1336 1337 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved); 1338 reg.dw1.bits.swizzle = inst->src[i].swizzle; 1339 reg.type = inst->src[i].type; 1340 if (inst->src[i].abs) 1341 reg = brw_abs(reg); 1342 if (inst->src[i].negate) 1343 reg = negate(reg); 1344 1345 inst->src[i].file = HW_REG; 1346 inst->src[i].fixed_hw_reg = reg; 
1347 } 1348 } 1349} 1350 1351int 1352vec4_vs_visitor::setup_attributes(int payload_reg) 1353{ 1354 int nr_attributes; 1355 int attribute_map[VERT_ATTRIB_MAX + 1]; 1356 memset(attribute_map, 0, sizeof(attribute_map)); 1357 1358 nr_attributes = 0; 1359 for (int i = 0; i < VERT_ATTRIB_MAX; i++) { 1360 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) { 1361 attribute_map[i] = payload_reg + nr_attributes; 1362 nr_attributes++; 1363 } 1364 } 1365 1366 /* VertexID is stored by the VF as the last vertex element, but we 1367 * don't represent it with a flag in inputs_read, so we call it 1368 * VERT_ATTRIB_MAX. 1369 */ 1370 if (vs_prog_data->uses_vertexid) { 1371 attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes; 1372 nr_attributes++; 1373 } 1374 1375 lower_attributes_to_hw_regs(attribute_map, false /* interleaved */); 1376 1377 /* The BSpec says we always have to read at least one thing from 1378 * the VF, and it appears that the hardware wedges otherwise. 1379 */ 1380 if (nr_attributes == 0) 1381 nr_attributes = 1; 1382 1383 prog_data->urb_read_length = (nr_attributes + 1) / 2; 1384 1385 unsigned vue_entries = 1386 MAX2(nr_attributes, prog_data->vue_map.num_slots); 1387 1388 if (brw->gen == 6) 1389 prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8; 1390 else 1391 prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4; 1392 1393 return payload_reg + nr_attributes; 1394} 1395 1396int 1397vec4_visitor::setup_uniforms(int reg) 1398{ 1399 prog_data->dispatch_grf_start_reg = reg; 1400 1401 /* The pre-gen6 VS requires that some push constants get loaded no 1402 * matter what, or the GPU would hang. 
1403 */ 1404 if (brw->gen < 6 && this->uniforms == 0) { 1405 this->uniform_vector_size[this->uniforms] = 1; 1406 1407 stage_prog_data->param = 1408 reralloc(NULL, stage_prog_data->param, const float *, 4); 1409 for (unsigned int i = 0; i < 4; i++) { 1410 unsigned int slot = this->uniforms * 4 + i; 1411 static float zero = 0.0; 1412 stage_prog_data->param[slot] = &zero; 1413 } 1414 1415 this->uniforms++; 1416 reg++; 1417 } else { 1418 reg += ALIGN(uniforms, 2) / 2; 1419 } 1420 1421 stage_prog_data->nr_params = this->uniforms * 4; 1422 1423 prog_data->curb_read_length = reg - prog_data->dispatch_grf_start_reg; 1424 1425 return reg; 1426} 1427 1428void 1429vec4_vs_visitor::setup_payload(void) 1430{ 1431 int reg = 0; 1432 1433 /* The payload always contains important data in g0, which contains 1434 * the URB handles that are passed on to the URB write at the end 1435 * of the thread. So, we always start push constants at g1. 1436 */ 1437 reg++; 1438 1439 reg = setup_uniforms(reg); 1440 1441 reg = setup_attributes(reg); 1442 1443 this->first_non_payload_grf = reg; 1444} 1445 1446src_reg 1447vec4_visitor::get_timestamp() 1448{ 1449 assert(brw->gen >= 7); 1450 1451 src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, 1452 BRW_ARF_TIMESTAMP, 1453 0, 1454 BRW_REGISTER_TYPE_UD, 1455 BRW_VERTICAL_STRIDE_0, 1456 BRW_WIDTH_4, 1457 BRW_HORIZONTAL_STRIDE_4, 1458 BRW_SWIZZLE_XYZW, 1459 WRITEMASK_XYZW)); 1460 1461 dst_reg dst = dst_reg(this, glsl_type::uvec4_type); 1462 1463 vec4_instruction *mov = emit(MOV(dst, ts)); 1464 /* We want to read the 3 fields we care about (mostly field 0, but also 2) 1465 * even if it's not enabled in the dispatch. 
1466 */ 1467 mov->force_writemask_all = true; 1468 1469 return src_reg(dst); 1470} 1471 1472void 1473vec4_visitor::emit_shader_time_begin() 1474{ 1475 current_annotation = "shader time start"; 1476 shader_start_time = get_timestamp(); 1477} 1478 1479void 1480vec4_visitor::emit_shader_time_end() 1481{ 1482 current_annotation = "shader time end"; 1483 src_reg shader_end_time = get_timestamp(); 1484 1485 1486 /* Check that there weren't any timestamp reset events (assuming these 1487 * were the only two timestamp reads that happened). 1488 */ 1489 src_reg reset_end = shader_end_time; 1490 reset_end.swizzle = BRW_SWIZZLE_ZZZZ; 1491 vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u))); 1492 test->conditional_mod = BRW_CONDITIONAL_Z; 1493 1494 emit(IF(BRW_PREDICATE_NORMAL)); 1495 1496 /* Take the current timestamp and get the delta. */ 1497 shader_start_time.negate = true; 1498 dst_reg diff = dst_reg(this, glsl_type::uint_type); 1499 emit(ADD(diff, shader_start_time, shader_end_time)); 1500 1501 /* If there were no instructions between the two timestamp gets, the diff 1502 * is 2 cycles. Remove that overhead, so I can forget about that when 1503 * trying to determine the time taken for single instructions. 
1504 */ 1505 emit(ADD(diff, src_reg(diff), src_reg(-2u))); 1506 1507 emit_shader_time_write(st_base, src_reg(diff)); 1508 emit_shader_time_write(st_written, src_reg(1u)); 1509 emit(BRW_OPCODE_ELSE); 1510 emit_shader_time_write(st_reset, src_reg(1u)); 1511 emit(BRW_OPCODE_ENDIF); 1512} 1513 1514void 1515vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type, 1516 src_reg value) 1517{ 1518 int shader_time_index = 1519 brw_get_shader_time_index(brw, shader_prog, prog, type); 1520 1521 dst_reg dst = 1522 dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2)); 1523 1524 dst_reg offset = dst; 1525 dst_reg time = dst; 1526 time.reg_offset++; 1527 1528 offset.type = BRW_REGISTER_TYPE_UD; 1529 emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE))); 1530 1531 time.type = BRW_REGISTER_TYPE_UD; 1532 emit(MOV(time, src_reg(value))); 1533 1534 emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst)); 1535} 1536 1537bool 1538vec4_visitor::run() 1539{ 1540 sanity_param_count = prog->Parameters->NumParameters; 1541 1542 if (INTEL_DEBUG & DEBUG_SHADER_TIME) 1543 emit_shader_time_begin(); 1544 1545 assign_common_binding_table_offsets(0); 1546 1547 emit_prolog(); 1548 1549 /* Generate VS IR for main(). (the visitor only descends into 1550 * functions called "main"). 1551 */ 1552 if (shader) { 1553 visit_instructions(shader->base.ir); 1554 } else { 1555 emit_program_code(); 1556 } 1557 base_ir = NULL; 1558 1559 if (key->userclip_active && !prog->UsesClipDistanceOut) 1560 setup_uniform_clipplane_values(); 1561 1562 emit_thread_end(); 1563 1564 /* Before any optimization, push array accesses out to scratch 1565 * space where we need them to be. This pass may allocate new 1566 * virtual GRFs, so we want to do it early. It also makes sure 1567 * that we have reladdr computations available for CSE, since we'll 1568 * often do repeated subexpressions for those. 
1569 */ 1570 if (shader) { 1571 move_grf_array_access_to_scratch(); 1572 move_uniform_array_access_to_pull_constants(); 1573 } else { 1574 /* The ARB_vertex_program frontend emits pull constant loads directly 1575 * rather than using reladdr, so we don't need to walk through all the 1576 * instructions looking for things to move. There isn't anything. 1577 * 1578 * We do still need to split things to vec4 size. 1579 */ 1580 split_uniform_registers(); 1581 } 1582 pack_uniform_registers(); 1583 move_push_constants_to_pull_constants(); 1584 split_virtual_grfs(); 1585 1586 bool progress; 1587 do { 1588 progress = false; 1589 progress = dead_code_eliminate() || progress; 1590 progress = dead_control_flow_eliminate(this) || progress; 1591 progress = opt_copy_propagation() || progress; 1592 progress = opt_algebraic() || progress; 1593 progress = opt_register_coalesce() || progress; 1594 } while (progress); 1595 1596 1597 if (failed) 1598 return false; 1599 1600 setup_payload(); 1601 1602 if (false) { 1603 /* Debug of register spilling: Go spill everything. */ 1604 const int grf_count = virtual_grf_count; 1605 float spill_costs[virtual_grf_count]; 1606 bool no_spill[virtual_grf_count]; 1607 evaluate_spill_costs(spill_costs, no_spill); 1608 for (int i = 0; i < grf_count; i++) { 1609 if (no_spill[i]) 1610 continue; 1611 spill_reg(i); 1612 } 1613 } 1614 1615 while (!reg_allocate()) { 1616 if (failed) 1617 return false; 1618 } 1619 1620 opt_schedule_instructions(); 1621 1622 opt_set_dependency_control(); 1623 1624 /* If any state parameters were appended, then ParameterValues could have 1625 * been realloced, in which case the driver uniform storage set up by 1626 * _mesa_associate_uniform_storage() would point to freed memory. Make 1627 * sure that didn't happen. 1628 */ 1629 assert(sanity_param_count == prog->Parameters->NumParameters); 1630 1631 return !failed; 1632} 1633 1634} /* namespace brw */ 1635 1636extern "C" { 1637 1638/** 1639 * Compile a vertex shader. 
1640 * 1641 * Returns the final assembly and the program's size. 1642 */ 1643const unsigned * 1644brw_vs_emit(struct brw_context *brw, 1645 struct gl_shader_program *prog, 1646 struct brw_vs_compile *c, 1647 struct brw_vs_prog_data *prog_data, 1648 void *mem_ctx, 1649 unsigned *final_assembly_size) 1650{ 1651 bool start_busy = false; 1652 float start_time = 0; 1653 1654 if (unlikely(brw->perf_debug)) { 1655 start_busy = (brw->batch.last_bo && 1656 drm_intel_bo_busy(brw->batch.last_bo)); 1657 start_time = get_time(); 1658 } 1659 1660 struct brw_shader *shader = NULL; 1661 if (prog) 1662 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX]; 1663 1664 if (unlikely(INTEL_DEBUG & DEBUG_VS)) { 1665 if (prog) { 1666 printf("GLSL IR for native vertex shader %d:\n", prog->Name); 1667 _mesa_print_ir(shader->base.ir, NULL); 1668 printf("\n\n"); 1669 } else { 1670 printf("ARB_vertex_program %d for native vertex shader\n", 1671 c->vp->program.Base.Id); 1672 _mesa_print_program(&c->vp->program.Base); 1673 } 1674 } 1675 1676 vec4_vs_visitor v(brw, c, prog_data, prog, shader, mem_ctx); 1677 if (!v.run()) { 1678 if (prog) { 1679 prog->LinkStatus = false; 1680 ralloc_strcat(&prog->InfoLog, v.fail_msg); 1681 } 1682 1683 _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", 1684 v.fail_msg); 1685 1686 return NULL; 1687 } 1688 1689 const unsigned *assembly = NULL; 1690 if (brw->gen >= 8) { 1691 gen8_vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base, 1692 mem_ctx, INTEL_DEBUG & DEBUG_VS); 1693 assembly = g.generate_assembly(&v.instructions, final_assembly_size); 1694 } else { 1695 vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base, 1696 mem_ctx, INTEL_DEBUG & DEBUG_VS); 1697 assembly = g.generate_assembly(&v.instructions, final_assembly_size); 1698 } 1699 1700 if (unlikely(brw->perf_debug) && shader) { 1701 if (shader->compiled_once) { 1702 brw_vs_debug_recompile(brw, prog, &c->key); 1703 } 1704 if (start_busy && 
!drm_intel_bo_busy(brw->batch.last_bo)) { 1705 perf_debug("VS compile took %.03f ms and stalled the GPU\n", 1706 (get_time() - start_time) * 1000); 1707 } 1708 shader->compiled_once = true; 1709 } 1710 1711 return assembly; 1712} 1713 1714 1715void 1716brw_vec4_setup_prog_key_for_precompile(struct gl_context *ctx, 1717 struct brw_vec4_prog_key *key, 1718 GLuint id, struct gl_program *prog) 1719{ 1720 key->program_string_id = id; 1721 key->clamp_vertex_color = ctx->API == API_OPENGL_COMPAT; 1722 1723 unsigned sampler_count = _mesa_fls(prog->SamplersUsed); 1724 for (unsigned i = 0; i < sampler_count; i++) { 1725 if (prog->ShadowSamplers & (1 << i)) { 1726 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */ 1727 key->tex.swizzles[i] = 1728 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE); 1729 } else { 1730 /* Color sampler: assume no swizzling. */ 1731 key->tex.swizzles[i] = SWIZZLE_XYZW; 1732 } 1733 } 1734} 1735 1736} /* extern "C" */ 1737