brw_vec4.cpp revision 1d4f3ca8f0442821c914b758b323e6e5124149a3
1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_vec4.h" 25extern "C" { 26#include "main/macros.h" 27#include "program/prog_parameter.h" 28} 29 30#define MAX_INSTRUCTION (1 << 30) 31 32namespace brw { 33 34bool 35vec4_instruction::is_math() 36{ 37 return (opcode == SHADER_OPCODE_RCP || 38 opcode == SHADER_OPCODE_RSQ || 39 opcode == SHADER_OPCODE_SQRT || 40 opcode == SHADER_OPCODE_EXP2 || 41 opcode == SHADER_OPCODE_LOG2 || 42 opcode == SHADER_OPCODE_SIN || 43 opcode == SHADER_OPCODE_COS || 44 opcode == SHADER_OPCODE_INT_QUOTIENT || 45 opcode == SHADER_OPCODE_INT_REMAINDER || 46 opcode == SHADER_OPCODE_POW); 47} 48/** 49 * Returns how many MRFs an opcode will write over. 50 * 51 * Note that this is not the 0 or 1 implied writes in an actual gen 52 * instruction -- the generate_* functions generate additional MOVs 53 * for setup. 54 */ 55int 56vec4_visitor::implied_mrf_writes(vec4_instruction *inst) 57{ 58 if (inst->mlen == 0) 59 return 0; 60 61 switch (inst->opcode) { 62 case SHADER_OPCODE_RCP: 63 case SHADER_OPCODE_RSQ: 64 case SHADER_OPCODE_SQRT: 65 case SHADER_OPCODE_EXP2: 66 case SHADER_OPCODE_LOG2: 67 case SHADER_OPCODE_SIN: 68 case SHADER_OPCODE_COS: 69 return 1; 70 case SHADER_OPCODE_POW: 71 return 2; 72 case VS_OPCODE_URB_WRITE: 73 return 1; 74 case VS_OPCODE_PULL_CONSTANT_LOAD: 75 return 2; 76 case VS_OPCODE_SCRATCH_READ: 77 return 2; 78 case VS_OPCODE_SCRATCH_WRITE: 79 return 3; 80 default: 81 assert(!"not reached"); 82 return inst->mlen; 83 } 84} 85 86bool 87src_reg::equals(src_reg *r) 88{ 89 return (file == r->file && 90 reg == r->reg && 91 reg_offset == r->reg_offset && 92 type == r->type && 93 negate == r->negate && 94 abs == r->abs && 95 swizzle == r->swizzle && 96 !reladdr && !r->reladdr && 97 memcmp(&fixed_hw_reg, &r->fixed_hw_reg, 98 sizeof(fixed_hw_reg)) == 0 && 99 imm.u == r->imm.u); 100} 101 102void 103vec4_visitor::calculate_live_intervals() 104{ 105 int *def = ralloc_array(mem_ctx, int, virtual_grf_count); 106 int *use = ralloc_array(mem_ctx, int, virtual_grf_count); 107 int loop_depth = 0; 108 int loop_start = 0; 109 110 if (this->live_intervals_valid) 111 return; 112 113 for (int i = 0; i < virtual_grf_count; i++) { 114 def[i] = MAX_INSTRUCTION; 115 use[i] = -1; 116 } 117 118 int ip = 0; 119 foreach_list(node, &this->instructions) { 120 vec4_instruction *inst = (vec4_instruction *)node; 121 122 if (inst->opcode == BRW_OPCODE_DO) { 123 if (loop_depth++ == 0) 124 loop_start = ip; 125 } else if (inst->opcode == BRW_OPCODE_WHILE) { 126 loop_depth--; 127 128 if (loop_depth == 0) { 129 /* Patches up the use of vars marked for being live across 130 * the whole loop. 131 */ 132 for (int i = 0; i < virtual_grf_count; i++) { 133 if (use[i] == loop_start) { 134 use[i] = ip; 135 } 136 } 137 } 138 } else { 139 for (unsigned int i = 0; i < 3; i++) { 140 if (inst->src[i].file == GRF) { 141 int reg = inst->src[i].reg; 142 143 if (!loop_depth) { 144 use[reg] = ip; 145 } else { 146 def[reg] = MIN2(loop_start, def[reg]); 147 use[reg] = loop_start; 148 149 /* Nobody else is going to go smash our start to 150 * later in the loop now, because def[reg] now 151 * points before the bb header. 152 */ 153 } 154 } 155 } 156 if (inst->dst.file == GRF) { 157 int reg = inst->dst.reg; 158 159 if (!loop_depth) { 160 def[reg] = MIN2(def[reg], ip); 161 } else { 162 def[reg] = MIN2(def[reg], loop_start); 163 } 164 } 165 } 166 167 ip++; 168 } 169 170 ralloc_free(this->virtual_grf_def); 171 ralloc_free(this->virtual_grf_use); 172 this->virtual_grf_def = def; 173 this->virtual_grf_use = use; 174 175 this->live_intervals_valid = true; 176} 177 178bool 179vec4_visitor::virtual_grf_interferes(int a, int b) 180{ 181 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 182 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 183 184 /* We can't handle dead register writes here, without iterating 185 * over the whole instruction stream to find every single dead 186 * write to that register to compare to the live interval of the 187 * other register. Just assert that dead_code_eliminate() has been 188 * called. 189 */ 190 assert((this->virtual_grf_use[a] != -1 || 191 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 192 (this->virtual_grf_use[b] != -1 || 193 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 194 195 return start < end; 196} 197 198/** 199 * Must be called after calculate_live_intervales() to remove unused 200 * writes to registers -- register allocation will fail otherwise 201 * because something deffed but not used won't be considered to 202 * interfere with other regs. 203 */ 204bool 205vec4_visitor::dead_code_eliminate() 206{ 207 bool progress = false; 208 int pc = 0; 209 210 calculate_live_intervals(); 211 212 foreach_list_safe(node, &this->instructions) { 213 vec4_instruction *inst = (vec4_instruction *)node; 214 215 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 216 inst->remove(); 217 progress = true; 218 } 219 220 pc++; 221 } 222 223 if (progress) 224 live_intervals_valid = false; 225 226 return progress; 227} 228 229void 230vec4_visitor::split_uniform_registers() 231{ 232 /* Prior to this, uniforms have been in an array sized according to 233 * the number of vector uniforms present, sparsely filled (so an 234 * aggregate results in reg indices being skipped over). Now we're 235 * going to cut those aggregates up so each .reg index is one 236 * vector. The goal is to make elimination of unused uniform 237 * components easier later. 238 */ 239 foreach_list(node, &this->instructions) { 240 vec4_instruction *inst = (vec4_instruction *)node; 241 242 for (int i = 0 ; i < 3; i++) { 243 if (inst->src[i].file != UNIFORM) 244 continue; 245 246 assert(!inst->src[i].reladdr); 247 248 inst->src[i].reg += inst->src[i].reg_offset; 249 inst->src[i].reg_offset = 0; 250 } 251 } 252 253 /* Update that everything is now vector-sized. */ 254 for (int i = 0; i < this->uniforms; i++) { 255 this->uniform_size[i] = 1; 256 } 257} 258 259void 260vec4_visitor::pack_uniform_registers() 261{ 262 bool uniform_used[this->uniforms]; 263 int new_loc[this->uniforms]; 264 int new_chan[this->uniforms]; 265 266 memset(uniform_used, 0, sizeof(uniform_used)); 267 memset(new_loc, 0, sizeof(new_loc)); 268 memset(new_chan, 0, sizeof(new_chan)); 269 270 /* Find which uniform vectors are actually used by the program. We 271 * expect unused vector elements when we've moved array access out 272 * to pull constants, and from some GLSL code generators like wine. 273 */ 274 foreach_list(node, &this->instructions) { 275 vec4_instruction *inst = (vec4_instruction *)node; 276 277 for (int i = 0 ; i < 3; i++) { 278 if (inst->src[i].file != UNIFORM) 279 continue; 280 281 uniform_used[inst->src[i].reg] = true; 282 } 283 } 284 285 int new_uniform_count = 0; 286 287 /* Now, figure out a packing of the live uniform vectors into our 288 * push constants. 289 */ 290 for (int src = 0; src < uniforms; src++) { 291 int size = this->uniform_vector_size[src]; 292 293 if (!uniform_used[src]) { 294 this->uniform_vector_size[src] = 0; 295 continue; 296 } 297 298 int dst; 299 /* Find the lowest place we can slot this uniform in. */ 300 for (dst = 0; dst < src; dst++) { 301 if (this->uniform_vector_size[dst] + size <= 4) 302 break; 303 } 304 305 if (src == dst) { 306 new_loc[src] = dst; 307 new_chan[src] = 0; 308 } else { 309 new_loc[src] = dst; 310 new_chan[src] = this->uniform_vector_size[dst]; 311 312 /* Move the references to the data */ 313 for (int j = 0; j < size; j++) { 314 c->prog_data.param[dst * 4 + new_chan[src] + j] = 315 c->prog_data.param[src * 4 + j]; 316 } 317 318 this->uniform_vector_size[dst] += size; 319 this->uniform_vector_size[src] = 0; 320 } 321 322 new_uniform_count = MAX2(new_uniform_count, dst + 1); 323 } 324 325 this->uniforms = new_uniform_count; 326 327 /* Now, update the instructions for our repacked uniforms. */ 328 foreach_list(node, &this->instructions) { 329 vec4_instruction *inst = (vec4_instruction *)node; 330 331 for (int i = 0 ; i < 3; i++) { 332 int src = inst->src[i].reg; 333 334 if (inst->src[i].file != UNIFORM) 335 continue; 336 337 inst->src[i].reg = new_loc[src]; 338 339 int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src]; 340 int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src]; 341 int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src]; 342 int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src]; 343 inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw); 344 } 345 } 346} 347 348bool 349src_reg::is_zero() const 350{ 351 if (file != IMM) 352 return false; 353 354 if (type == BRW_REGISTER_TYPE_F) { 355 return imm.f == 0.0; 356 } else { 357 return imm.i == 0; 358 } 359} 360 361bool 362src_reg::is_one() const 363{ 364 if (file != IMM) 365 return false; 366 367 if (type == BRW_REGISTER_TYPE_F) { 368 return imm.f == 1.0; 369 } else { 370 return imm.i == 1; 371 } 372} 373 374/** 375 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a). 376 * 377 * While GLSL IR also performs this optimization, we end up with it in 378 * our instruction stream for a couple of reasons. One is that we 379 * sometimes generate silly instructions, for example in array access 380 * where we'll generate "ADD offset, index, base" even if base is 0. 381 * The other is that GLSL IR's constant propagation doesn't track the 382 * components of aggregates, so some VS patterns (initialize matrix to 383 * 0, accumulate in vertex blending factors) end up breaking down to 384 * instructions involving 0. 385 */ 386bool 387vec4_visitor::opt_algebraic() 388{ 389 bool progress = false; 390 391 foreach_list(node, &this->instructions) { 392 vec4_instruction *inst = (vec4_instruction *)node; 393 394 switch (inst->opcode) { 395 case BRW_OPCODE_ADD: 396 if (inst->src[1].is_zero()) { 397 inst->opcode = BRW_OPCODE_MOV; 398 inst->src[1] = src_reg(); 399 progress = true; 400 } 401 break; 402 403 case BRW_OPCODE_MUL: 404 if (inst->src[1].is_zero()) { 405 inst->opcode = BRW_OPCODE_MOV; 406 switch (inst->src[0].type) { 407 case BRW_REGISTER_TYPE_F: 408 inst->src[0] = src_reg(0.0f); 409 break; 410 case BRW_REGISTER_TYPE_D: 411 inst->src[0] = src_reg(0); 412 break; 413 case BRW_REGISTER_TYPE_UD: 414 inst->src[0] = src_reg(0u); 415 break; 416 default: 417 assert(!"not reached"); 418 inst->src[0] = src_reg(0.0f); 419 break; 420 } 421 inst->src[1] = src_reg(); 422 progress = true; 423 } else if (inst->src[1].is_one()) { 424 inst->opcode = BRW_OPCODE_MOV; 425 inst->src[1] = src_reg(); 426 progress = true; 427 } 428 break; 429 default: 430 break; 431 } 432 } 433 434 if (progress) 435 this->live_intervals_valid = false; 436 437 return progress; 438} 439 440/** 441 * Only a limited number of hardware registers may be used for push 442 * constants, so this turns access to the overflowed constants into 443 * pull constants. 444 */ 445void 446vec4_visitor::move_push_constants_to_pull_constants() 447{ 448 int pull_constant_loc[this->uniforms]; 449 450 /* Only allow 32 registers (256 uniform components) as push constants, 451 * which is the limit on gen6. 452 */ 453 int max_uniform_components = 32 * 8; 454 if (this->uniforms * 4 <= max_uniform_components) 455 return; 456 457 /* Make some sort of choice as to which uniforms get sent to pull 458 * constants. We could potentially do something clever here like 459 * look for the most infrequently used uniform vec4s, but leave 460 * that for later. 461 */ 462 for (int i = 0; i < this->uniforms * 4; i += 4) { 463 pull_constant_loc[i / 4] = -1; 464 465 if (i >= max_uniform_components) { 466 const float **values = &prog_data->param[i]; 467 468 /* Try to find an existing copy of this uniform in the pull 469 * constants if it was part of an array access already. 470 */ 471 for (unsigned int j = 0; j < prog_data->nr_pull_params; j += 4) { 472 int matches; 473 474 for (matches = 0; matches < 4; matches++) { 475 if (prog_data->pull_param[j + matches] != values[matches]) 476 break; 477 } 478 479 if (matches == 4) { 480 pull_constant_loc[i / 4] = j / 4; 481 break; 482 } 483 } 484 485 if (pull_constant_loc[i / 4] == -1) { 486 assert(prog_data->nr_pull_params % 4 == 0); 487 pull_constant_loc[i / 4] = prog_data->nr_pull_params / 4; 488 489 for (int j = 0; j < 4; j++) { 490 prog_data->pull_param[prog_data->nr_pull_params++] = values[j]; 491 } 492 } 493 } 494 } 495 496 /* Now actually rewrite usage of the things we've moved to pull 497 * constants. 498 */ 499 foreach_list_safe(node, &this->instructions) { 500 vec4_instruction *inst = (vec4_instruction *)node; 501 502 for (int i = 0 ; i < 3; i++) { 503 if (inst->src[i].file != UNIFORM || 504 pull_constant_loc[inst->src[i].reg] == -1) 505 continue; 506 507 int uniform = inst->src[i].reg; 508 509 dst_reg temp = dst_reg(this, glsl_type::vec4_type); 510 511 emit_pull_constant_load(inst, temp, inst->src[i], 512 pull_constant_loc[uniform]); 513 514 inst->src[i].file = temp.file; 515 inst->src[i].reg = temp.reg; 516 inst->src[i].reg_offset = temp.reg_offset; 517 inst->src[i].reladdr = NULL; 518 } 519 } 520 521 /* Repack push constants to remove the now-unused ones. */ 522 pack_uniform_registers(); 523} 524 525/* 526 * Tries to reduce extra MOV instructions by taking GRFs that get just 527 * written and then MOVed into an MRF and making the original write of 528 * the GRF write directly to the MRF instead. 529 */ 530bool 531vec4_visitor::opt_compute_to_mrf() 532{ 533 bool progress = false; 534 int next_ip = 0; 535 536 calculate_live_intervals(); 537 538 foreach_list_safe(node, &this->instructions) { 539 vec4_instruction *inst = (vec4_instruction *)node; 540 541 int ip = next_ip; 542 next_ip++; 543 544 if (inst->opcode != BRW_OPCODE_MOV || 545 inst->predicate || 546 inst->dst.file != MRF || inst->src[0].file != GRF || 547 inst->dst.type != inst->src[0].type || 548 inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr) 549 continue; 550 551 int mrf = inst->dst.reg; 552 553 /* Can't compute-to-MRF this GRF if someone else was going to 554 * read it later. 555 */ 556 if (this->virtual_grf_use[inst->src[0].reg] > ip) 557 continue; 558 559 /* We need to check interference with the MRF between this 560 * instruction and the earliest instruction involved in writing 561 * the GRF we're eliminating. To do that, keep track of which 562 * of our source channels we've seen initialized. 563 */ 564 bool chans_needed[4] = {false, false, false, false}; 565 int chans_remaining = 0; 566 for (int i = 0; i < 4; i++) { 567 int chan = BRW_GET_SWZ(inst->src[0].swizzle, i); 568 569 if (!(inst->dst.writemask & (1 << i))) 570 continue; 571 572 /* We don't handle compute-to-MRF across a swizzle. We would 573 * need to be able to rewrite instructions above to output 574 * results to different channels. 575 */ 576 if (chan != i) 577 chans_remaining = 5; 578 579 if (!chans_needed[chan]) { 580 chans_needed[chan] = true; 581 chans_remaining++; 582 } 583 } 584 if (chans_remaining > 4) 585 continue; 586 587 /* Now walk up the instruction stream trying to see if we can 588 * rewrite everything writing to the GRF into the MRF instead. 589 */ 590 vec4_instruction *scan_inst; 591 for (scan_inst = (vec4_instruction *)inst->prev; 592 scan_inst->prev != NULL; 593 scan_inst = (vec4_instruction *)scan_inst->prev) { 594 if (scan_inst->dst.file == GRF && 595 scan_inst->dst.reg == inst->src[0].reg && 596 scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 597 /* Found something writing to the reg we want to turn into 598 * a compute-to-MRF. 599 */ 600 601 /* SEND instructions can't have MRF as a destination. */ 602 if (scan_inst->mlen) 603 break; 604 605 if (intel->gen >= 6) { 606 /* gen6 math instructions must have the destination be 607 * GRF, so no compute-to-MRF for them. 608 */ 609 if (scan_inst->is_math()) { 610 break; 611 } 612 } 613 614 /* Mark which channels we found unconditional writes for. */ 615 if (!scan_inst->predicate) { 616 for (int i = 0; i < 4; i++) { 617 if (scan_inst->dst.writemask & (1 << i) && 618 chans_needed[i]) { 619 chans_needed[i] = false; 620 chans_remaining--; 621 } 622 } 623 } 624 625 if (chans_remaining == 0) 626 break; 627 } 628 629 /* We don't handle flow control here. Most computation of 630 * values that end up in MRFs are shortly before the MRF 631 * write anyway. 632 */ 633 if (scan_inst->opcode == BRW_OPCODE_DO || 634 scan_inst->opcode == BRW_OPCODE_WHILE || 635 scan_inst->opcode == BRW_OPCODE_ELSE || 636 scan_inst->opcode == BRW_OPCODE_ENDIF) { 637 break; 638 } 639 640 /* You can't read from an MRF, so if someone else reads our 641 * MRF's source GRF that we wanted to rewrite, that stops us. 642 */ 643 bool interfered = false; 644 for (int i = 0; i < 3; i++) { 645 if (scan_inst->src[i].file == GRF && 646 scan_inst->src[i].reg == inst->src[0].reg && 647 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 648 interfered = true; 649 } 650 } 651 if (interfered) 652 break; 653 654 /* If somebody else writes our MRF here, we can't 655 * compute-to-MRF before that. 656 */ 657 if (scan_inst->dst.file == MRF && mrf == scan_inst->dst.reg) 658 break; 659 660 if (scan_inst->mlen > 0) { 661 /* Found a SEND instruction, which means that there are 662 * live values in MRFs from base_mrf to base_mrf + 663 * scan_inst->mlen - 1. Don't go pushing our MRF write up 664 * above it. 665 */ 666 if (mrf >= scan_inst->base_mrf && 667 mrf < scan_inst->base_mrf + scan_inst->mlen) { 668 break; 669 } 670 } 671 } 672 673 if (chans_remaining == 0) { 674 /* If we've made it here, we have an inst we want to 675 * compute-to-MRF, and a scan_inst pointing to the earliest 676 * instruction involved in computing the value. Now go 677 * rewrite the instruction stream between the two. 678 */ 679 680 while (scan_inst != inst) { 681 if (scan_inst->dst.file == GRF && 682 scan_inst->dst.reg == inst->src[0].reg && 683 scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 684 scan_inst->dst.file = MRF; 685 scan_inst->dst.reg = mrf; 686 scan_inst->dst.reg_offset = 0; 687 scan_inst->saturate |= inst->saturate; 688 } 689 scan_inst = (vec4_instruction *)scan_inst->next; 690 } 691 inst->remove(); 692 progress = true; 693 } 694 } 695 696 if (progress) 697 live_intervals_valid = false; 698 699 return progress; 700} 701 702} /* namespace brw */ 703