brw_vec4.cpp revision 7e7c40ff98cc2b930bc3113609ace5430f2bdc95
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
extern "C" {
#include "main/macros.h"
#include "program/prog_parameter.h"
}

#define MAX_INSTRUCTION (1 << 30)

namespace brw {

bool
vec4_instruction::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
vec4_instruction::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}
/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case VS_OPCODE_SCRATCH_READ:
      return 2;
   case VS_OPCODE_SCRATCH_WRITE:
      return 3;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

bool
src_reg::equals(src_reg *r)
{
   return (file == r->file &&
           reg == r->reg &&
           reg_offset == r->reg_offset &&
           type == r->type &&
           negate == r->negate &&
           abs == r->abs &&
           swizzle == r->swizzle &&
           !reladdr && !r->reladdr &&
           memcmp(&fixed_hw_reg, &r->fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           imm.u == r->imm.u);
}

void
vec4_visitor::calculate_live_intervals()
{
   int *def = ralloc_array(mem_ctx, int, virtual_grf_count);
   int *use = ralloc_array(mem_ctx, int, virtual_grf_count);
   int loop_depth = 0;
   int loop_start = 0;

   if (this->live_intervals_valid)
      return;

   for (int i = 0; i < virtual_grf_count; i++) {
      def[i] = MAX_INSTRUCTION;
      use[i] = -1;
   }

   int ip = 0;
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->opcode == BRW_OPCODE_DO) {
         if (loop_depth++ == 0)
            loop_start = ip;
      } else if (inst->opcode == BRW_OPCODE_WHILE) {
         loop_depth--;

         if (loop_depth == 0) {
            /* Patches up the use of vars marked for being live across
             * the whole loop.
             */
            for (int i = 0; i < virtual_grf_count; i++) {
               if (use[i] == loop_start) {
                  use[i] = ip;
               }
            }
         }
      } else {
         for (unsigned int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               int reg = inst->src[i].reg;

               if (!loop_depth) {
                  use[reg] = ip;
               } else {
                  def[reg] = MIN2(loop_start, def[reg]);
                  use[reg] = loop_start;

                  /* Nobody else is going to go smash our start to
                   * later in the loop now, because def[reg] now
                   * points before the bb header.
                   */
               }
            }
         }
         if (inst->dst.file == GRF) {
            int reg = inst->dst.reg;

            if (!loop_depth) {
               def[reg] = MIN2(def[reg], ip);
            } else {
               def[reg] = MIN2(def[reg], loop_start);
            }
         }
      }

      ip++;
   }

   ralloc_free(this->virtual_grf_def);
   ralloc_free(this->virtual_grf_use);
   this->virtual_grf_def = def;
   this->virtual_grf_use = use;

   this->live_intervals_valid = true;
}

bool
vec4_visitor::virtual_grf_interferes(int a, int b)
{
   int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
   int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);

   /* We can't handle dead register writes here, without iterating
    * over the whole instruction stream to find every single dead
    * write to that register to compare to the live interval of the
    * other register. Just assert that dead_code_eliminate() has been
    * called.
    */
   assert((this->virtual_grf_use[a] != -1 ||
           this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
          (this->virtual_grf_use[b] != -1 ||
           this->virtual_grf_def[b] == MAX_INSTRUCTION));

   return start < end;
}
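/* A worked example of the bookkeeping above, with made-up register
 * numbers, for a straight-line (loop_depth == 0) stream:
 *
 *    ip 0:  MOV grf1, ...          def[1] = 0
 *    ip 1:  ADD grf2, grf1, c      def[2] = 1, use[1] = 1
 *    ip 2:  MUL grf3, grf2, c      def[3] = 2, use[2] = 2
 *    ip 3:  MOV mrf4, grf3         use[3] = 3
 *
 * For grf1 and grf3, start = MAX2(0, 2) = 2 and end = MIN2(1, 3) = 1,
 * so start < end is false and the two may share a hardware register.
 * Inside a loop, a read instead pins the interval to cover the whole
 * [loop_start, WHILE] range, since the value may be carried from one
 * iteration to the next.
 */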
/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
bool
vec4_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over). Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector. The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}

void
vec4_visitor::pack_uniform_registers()
{
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program. We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      int size = this->uniform_vector_size[src];

      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            c->prog_data.param[dst * 4 + new_chan[src] + j] =
               c->prog_data.param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];

         int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src];
         int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src];
         int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src];
         int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src];
         inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
      }
   }
}
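/* An illustrative packing: given a live vec3 uniform in slot 0 and a
 * live float uniform in slot 2 (slot 1 unused), the float fits in slot
 * 0's spare channel, so new_loc[2] = 0 and new_chan[2] = 3. A source
 * reading uniform 2 with swizzle .xxxx becomes uniform 0 with swizzle
 * .wwww (each BRW_GET_SWZ() result of 0 biased by new_chan == 3), and
 * this->uniforms drops from 3 to 1.
 */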
bool
src_reg::is_zero() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 0.0;
   } else {
      return imm.i == 0;
   }
}

bool
src_reg::is_one() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 1.0;
   } else {
      return imm.i == 1;
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons. One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               assert(!"not reached");
               inst->src[0] = src_reg(0.0f);
               break;
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      default:
         break;
      }
   }

   if (progress)
      this->live_intervals_valid = false;

   return progress;
}
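/* In IR terms, the rewrites performed above are (operands schematic):
 *
 *    ADD dst, a, 0  ->  MOV dst, a
 *    MUL dst, a, 0  ->  MOV dst, 0   (the 0 typed to match src[0])
 *    MUL dst, a, 1  ->  MOV dst, a
 *
 * Only src[1] is tested for the constant, since constant operands
 * generally end up as the second source by the time we get here.
 */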
/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants. We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const float **values = &prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}
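/* Budget math for the early return above: 32 push-constant registers at
 * 8 floats each is 256 scalar components, i.e. 64 vec4 uniforms. As a
 * hypothetical example, a shader using a mat4[20] array (80 vec4s, 320
 * components) overflows that budget, so the components at offset 256 and
 * up get demoted to pull constants, and each use of one costs an
 * emit_pull_constant_load() into a temporary GRF.
 */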
/*
 * Tries to reduce extra MOV instructions by taking GRFs that get just
 * written and then MOVed into an MRF and making the original write of
 * the GRF write directly to the MRF instead.
 */
bool
vec4_visitor::opt_compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      int mrf = inst->dst.reg;

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* We need to check interference with the MRF between this
       * instruction and the earliest instruction involved in writing
       * the GRF we're eliminating. To do that, keep track of which
       * of our source channels we've seen initialized.
       */
      bool chans_needed[4] = {false, false, false, false};
      int chans_remaining = 0;
      for (int i = 0; i < 4; i++) {
         int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);

         if (!(inst->dst.writemask & (1 << i)))
            continue;

         /* We don't handle compute-to-MRF across a swizzle. We would
          * need to be able to rewrite instructions above to output
          * results to different channels.
          */
         if (chan != i)
            chans_remaining = 5;

         if (!chans_needed[chan]) {
            chans_needed[chan] = true;
            chans_remaining++;
         }
      }
      if (chans_remaining > 4)
         continue;

      /* Now walk up the instruction stream trying to see if we can
       * rewrite everything writing to the GRF into the MRF instead.
       */
      vec4_instruction *scan_inst;
      for (scan_inst = (vec4_instruction *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (vec4_instruction *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg &&
             scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
            /* Found something writing to the reg we want to turn into
             * a compute-to-MRF.
             */

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (intel->gen >= 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate) {
               for (int i = 0; i < 4; i++) {
                  if (scan_inst->dst.writemask & (1 << i) &&
                      chans_needed[i]) {
                     chans_needed[i] = false;
                     chans_remaining--;
                  }
               }
            }

            if (chans_remaining == 0)
               break;
         }

         /* We don't handle flow control here. Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         /* If somebody else writes our MRF here, we can't
          * compute-to-MRF before that.
          */
         if (scan_inst->dst.file == MRF && mrf == scan_inst->dst.reg)
            break;

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1. Don't go pushing our MRF write up
             * above it.
             */
            if (mrf >= scan_inst->base_mrf &&
                mrf < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an inst we want to
          * compute-to-MRF, and a scan_inst pointing to the earliest
          * instruction involved in computing the value. Now go
          * rewrite the instruction stream between the two.
          */

         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = mrf;
               scan_inst->dst.reg_offset = 0;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove();
         progress = true;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
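/* The shape of the transform above, with made-up register numbers:
 *
 *    before:                        after:
 *       ADD grf4, grf1, grf2          ADD mrf3, grf1, grf2
 *       MOV mrf3, grf4                (MOV removed)
 *
 * It only fires when grf4 isn't read again later, the MOV's swizzle is
 * the identity, and nothing between the two instructions reads grf4,
 * writes mrf3, or is a SEND whose payload covers mrf3.
 */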
} /* namespace brw */