brw_fs.cpp revision 2e5a1a254ed81b1d3efa6064f48183eefac784d0
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs.cpp 25 * 26 * This file drives the GLSL IR -> LIR translation, contains the 27 * optimizations on the LIR, and drives the generation of native code 28 * from the LIR. 
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_shader.h"
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

/* Sentinel instruction index, larger than any real instruction pointer;
 * used to initialize def[] in calculate_live_intervals().
 */
#define MAX_INSTRUCTION (1 << 30)

/**
 * Returns the number of components of register storage a value of the
 * given GLSL type occupies.  Arrays and structs are the sum of their
 * members; samplers take none since they're baked in at link time.
 */
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      /* Arrays are consecutive copies of the element type. */
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

/**
 * Marks the compile as failed and records a printf-formatted reason in
 * this->fail_msg.  Only the first failure is recorded; later calls are
 * no-ops.  Both message strings are ralloc'ed on mem_ctx, so they are
 * released along with the visitor.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

/* Nested enable of "emit uncompressed (single-width) instructions only"
 * mode; balanced by pop_force_uncompressed().
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

/* Nested enable of "emit for the second half" mode; balanced by
 * pop_force_sechalf().
 */
void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* Unary math: one operand, scaled by dispatch width. */
      return 1 * c->dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Binary math: two operands, scaled by dispatch width. */
      return 2 * c->dispatch_width / 8;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXF:
   case FS_OPCODE_TXL:
   case FS_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

/**
 * Allocates a new virtual GRF of the given size (in registers) and
 * returns its index, growing the size-tracking array geometrically as
 * needed.
 */
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor (defaults to float type). */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor.
 */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   /* Allocate a fresh virtual GRF sized for the GLSL type. */
   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

/** Returns the register storage recorded for the variable, or NULL. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* hash_table_call_foreach() callback: copies entries whose reg is in
 * the UNIFORM file into the destination hash table passed as closure.
 */
void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Recursively walks the type, appending one param entry per scalar
 * component; returns the number of uniform slots consumed.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      /* A matrix is handled as matrix_columns consecutive column
       * vectors.
       */
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         assert(param < ARRAY_SIZE(c->prog_data.param));

         if (ctx->Const.NativeIntegers) {
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
         } else {
            /* Without native integers, values are stored as floats;
             * record how to convert back to the declared type.
             */
            switch (type->base_type) {
            case GLSL_TYPE_FLOAT:
               c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
               break;
            case GLSL_TYPE_UINT:
               c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
               break;
            case GLSL_TYPE_INT:
               c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
               break;
            case GLSL_TYPE_BOOL:
               c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
               break;
            default:
               assert(!"not reached");
               c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
               break;
            }
         }
         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it.
       */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         /* A repeated swizzle component marks the end of the unique
          * components for this slot.
          */
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param_convert[c->prog_data.nr_params] =
            PARAM_NO_CONVERT;
         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

/**
 * Emits the instructions producing gl_FragCoord, honoring the
 * pixel-center-integer and origin conventions (flipping Y when the
 * origin and render target orientation disagree).
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         /* Flip Y: wpos.y = (drawable_height - 1 + offset) - pixel_y */
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      /* Gen6+: source depth is delivered in the thread payload. */
      emit(BRW_OPCODE_MOV, wpos,
           fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

/**
 * Emits interpolation instructions for a generic varying, one
 * component at a time, covering arrays and matrices.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         bool is_gl_Color =
            location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

         if (c->key.flat_shade && is_gl_Color) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Perspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  /* Non-projected texcoord: .w is a constant 1.0. */
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit(FS_OPCODE_LINTERP, attr,
                       this->delta_x, this->delta_y, fs_reg(interp));
               }
               attr.reg_offset++;
            }

            if (intel->gen < 6) {
               /* Pre-gen6: multiply each interpolated component by
                * pixel_w to complete the perspective division.
                */
               attr.reg_offset -= type->vector_elements;
               for (unsigned int k = 0; k < type->vector_elements; k++) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  attr.reg_offset++;
               }
            }
         }
         location++;
      }
   }

   return reg;
}

/**
 * Emits instructions computing gl_FrontFacing as a 0/1 value from the
 * per-thread payload bits.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload.
    */
   if (intel->gen >= 6) {
      /* ASR by 15 then NOT/AND 1 extracts and inverts the bit from
       * g0.0 as a 0/1 value.
       */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
                           fs_reg(r1_6ud),
                           fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

/**
 * Emits a unary math instruction (RCP/RSQ/SQRT/EXP2/LOG2/SIN/COS),
 * working around per-generation restrictions on the source operand.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      /* Pre-gen6 math takes its operand through an MRF payload. */
      inst->base_mrf = 2;
      inst->mlen = c->dispatch_width / 8;
   }

   return inst;
}

/**
 * Emits a binary math instruction (POW or integer quotient/remainder),
 * with the same per-generation operand workarounds as the unary form.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src0.type;
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src1.type;
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ?
src0 : src1;

      /* Second operand goes through the MRF payload; the send reads
       * base_mrf..base_mrf+1.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * c->dispatch_width / 8;
   }
   return inst;
}

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   /* The 16-wide pass shares the param setup done by the 8-wide pass. */
   if (c->dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

/**
 * Maps UNIFORM-file register accesses onto the fixed payload registers
 * that the CURBE (push constant) upload lands in, and records the CURBE
 * read length for this dispatch width.
 */
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (c->dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            /* Eight constants per register; pick reg and subreg. */
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

/**
 * Computes urb_setup[]: the URB slot each read fragment attribute lands
 * in (-1 for unused), and the resulting urb_read_length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next++;
         }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

/**
 * Rewrites the interpolation setup register references (LINTERP /
 * CINTERP) to their final hardware register numbers, now that the
 * CURBE size is known.
 */
void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_next;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln) {
      /* PLN opcodes rely on the delta_xy being contiguous. */
      split_grf[this->delta_x.reg] = false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Texturing produces 4 contiguous registers, so no splitting. */
      if (inst->is_tex()) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         /* Offset 0 keeps the original register number; offsets 1..n-1
          * map to new_virtual_grf[i] + offset - 1.
          */
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Drops push-constant params that no instruction reads and renumbers
 * the survivors, recording the remap table for the 16-wide pass to
 * reuse.  Always returns true.
 */
bool
fs_visitor::remove_dead_constants()
{
   if (c->dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.
At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
         c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (c->dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         /* Emit a load of the demoted constant ahead of its use, then
          * redirect the use to the loaded GRF, smearing out the wanted
          * component.
          */
         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst);
         pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   /* Move the demoted params over to the pull_param list. */
   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
      c->prog_data.pull_param_convert[i] =
         c->prog_data.param_convert[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}

/**
 * Computes the def/use instruction indices (live intervals) for each
 * virtual GRF, conservatively extending intervals across whole loops.
 * No-op if the cached intervals are still valid.
 */
void
fs_visitor::calculate_live_intervals()
{
   int num_vars = this->virtual_grf_next;
   int *def = ralloc_array(mem_ctx, int, num_vars);
   int *use = ralloc_array(mem_ctx,
int, num_vars);
   int loop_depth = 0;
   int loop_start = 0;

   if (this->live_intervals_valid)
      return;

   for (int i = 0; i < num_vars; i++) {
      def[i] = MAX_INSTRUCTION;
      use[i] = -1;
   }

   int ip = 0;
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == BRW_OPCODE_DO) {
         /* Only the outermost loop's start matters here. */
         if (loop_depth++ == 0)
            loop_start = ip;
      } else if (inst->opcode == BRW_OPCODE_WHILE) {
         loop_depth--;

         if (loop_depth == 0) {
            /* Patches up the use of vars marked for being live across
             * the whole loop.
             */
            for (int i = 0; i < num_vars; i++) {
               if (use[i] == loop_start) {
                  use[i] = ip;
               }
            }
         }
      } else {
         for (unsigned int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               int reg = inst->src[i].reg;

               if (!loop_depth) {
                  use[reg] = ip;
               } else {
                  def[reg] = MIN2(loop_start, def[reg]);
                  use[reg] = loop_start;

                  /* Nobody else is going to go smash our start to
                   * later in the loop now, because def[reg] now
                   * points before the bb header.
                   */
               }
            }
         }
         if (inst->dst.file == GRF) {
            int reg = inst->dst.reg;

            if (!loop_depth) {
               def[reg] = MIN2(def[reg], ip);
            } else {
               def[reg] = MIN2(def[reg], loop_start);
            }
         }
      }

      ip++;
   }

   ralloc_free(this->virtual_grf_def);
   ralloc_free(this->virtual_grf_use);
   this->virtual_grf_def = def;
   this->virtual_grf_use = use;

   this->live_intervals_valid = true;
}

/**
 * Attempts to move immediate constants into the immediate
 * constant slot of following instructions.
 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, you can't do abs/negate on them,
 */

bool
fs_visitor::propagate_constants()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicated ||
          inst->dst.file != GRF || inst->src[0].file != IMM ||
          inst->dst.type != inst->src[0].type ||
          (c->dispatch_width == 16 &&
           (inst->force_uncompressed || inst->force_sechalf)))
         continue;

      /* Don't bother with cases where we should have had the
       * operation on the constant folded in GLSL already.
       */
      if (inst->saturate)
         continue;

      /* Found a move of a constant to a GRF.  Find anything else using the GRF
       * before it's written, and replace it with the constant if we can.
       */
      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         /* Stop at control flow: past this point we no longer dominate
          * the instructions we'd be rewriting.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         for (int i = 2; i >= 0; i--) {
            if (scan_inst->src[i].file != GRF ||
                scan_inst->src[i].reg != inst->dst.reg ||
                scan_inst->src[i].reg_offset != inst->dst.reg_offset)
               continue;

            /* Don't bother with cases where we should have had the
             * operation on the constant folded in GLSL already.
             */
            if (scan_inst->src[i].negate || scan_inst->src[i].abs)
               continue;

            switch (scan_inst->opcode) {
            case BRW_OPCODE_MOV:
               scan_inst->src[i] = inst->src[0];
               progress = true;
               break;

            case BRW_OPCODE_MUL:
            case BRW_OPCODE_ADD:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  /* Fit this constant in by commuting the operands */
                  scan_inst->src[0] = scan_inst->src[1];
                  scan_inst->src[1] = inst->src[0];
                  progress = true;
               }
               break;

            case BRW_OPCODE_CMP:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  uint32_t new_cmod;

                  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
                  if (new_cmod != ~0u) {
                     /* Fit this constant in by swapping the operands and
                      * flipping the test
                      */
                     scan_inst->src[0] = scan_inst->src[1];
                     scan_inst->src[1] = inst->src[0];
                     scan_inst->conditional_mod = new_cmod;
                     progress = true;
                  }
               }
               break;

            case BRW_OPCODE_SEL:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  scan_inst->src[0] = scan_inst->src[1];
                  scan_inst->src[1] = inst->src[0];

                  /* If this was predicated, flipping operands means
                   * we also need to flip the predicate.
                   */
                  if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
                     scan_inst->predicate_inverse =
                        !scan_inst->predicate_inverse;
                  }
                  progress = true;
               }
               break;

            case SHADER_OPCODE_RCP:
               /* The hardware doesn't do math on immediate values
                * (because why are you doing that, seriously?), but
                * the correct answer is to just constant fold it
                * anyway.
                */
               assert(i == 0);
               if (inst->src[0].imm.f != 0.0f) {
                  scan_inst->opcode = BRW_OPCODE_MOV;
                  scan_inst->src[0] = inst->src[0];
                  scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
                  progress = true;
               }
               break;

            default:
               break;
            }
         }

         /* Stop scanning once the GRF (or, for texturing, any part of
          * it) is overwritten.
          */
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->dst.reg &&
             (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
              scan_inst->is_tex())) {
            break;
         }
      }
   }

   if (progress)
      this->live_intervals_valid = false;

   return progress;
}


/**
 * Performs simple algebraic simplification on instructions with an
 * immediate operand, e.g. rewriting "a * 1.0" to a plain MOV of a.
 */

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 1.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
1244 */ 1245bool 1246fs_visitor::dead_code_eliminate() 1247{ 1248 bool progress = false; 1249 int pc = 0; 1250 1251 calculate_live_intervals(); 1252 1253 foreach_list_safe(node, &this->instructions) { 1254 fs_inst *inst = (fs_inst *)node; 1255 1256 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 1257 inst->remove(); 1258 progress = true; 1259 } 1260 1261 pc++; 1262 } 1263 1264 if (progress) 1265 live_intervals_valid = false; 1266 1267 return progress; 1268} 1269 1270bool 1271fs_visitor::register_coalesce() 1272{ 1273 bool progress = false; 1274 int if_depth = 0; 1275 int loop_depth = 0; 1276 1277 foreach_list_safe(node, &this->instructions) { 1278 fs_inst *inst = (fs_inst *)node; 1279 1280 /* Make sure that we dominate the instructions we're going to 1281 * scan for interfering with our coalescing, or we won't have 1282 * scanned enough to see if anything interferes with our 1283 * coalescing. We don't dominate the following instructions if 1284 * we're in a loop or an if block. 1285 */ 1286 switch (inst->opcode) { 1287 case BRW_OPCODE_DO: 1288 loop_depth++; 1289 break; 1290 case BRW_OPCODE_WHILE: 1291 loop_depth--; 1292 break; 1293 case BRW_OPCODE_IF: 1294 if_depth++; 1295 break; 1296 case BRW_OPCODE_ENDIF: 1297 if_depth--; 1298 break; 1299 default: 1300 break; 1301 } 1302 if (loop_depth || if_depth) 1303 continue; 1304 1305 if (inst->opcode != BRW_OPCODE_MOV || 1306 inst->predicated || 1307 inst->saturate || 1308 inst->dst.file != GRF || (inst->src[0].file != GRF && 1309 inst->src[0].file != UNIFORM)|| 1310 inst->dst.type != inst->src[0].type) 1311 continue; 1312 1313 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 1314 1315 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 1316 * them: check for no writes to either one until the exit of the 1317 * program. 
1318 */ 1319 bool interfered = false; 1320 1321 for (fs_inst *scan_inst = (fs_inst *)inst->next; 1322 !scan_inst->is_tail_sentinel(); 1323 scan_inst = (fs_inst *)scan_inst->next) { 1324 if (scan_inst->dst.file == GRF) { 1325 if (scan_inst->dst.reg == inst->dst.reg && 1326 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1327 scan_inst->is_tex())) { 1328 interfered = true; 1329 break; 1330 } 1331 if (inst->src[0].file == GRF && 1332 scan_inst->dst.reg == inst->src[0].reg && 1333 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 1334 scan_inst->is_tex())) { 1335 interfered = true; 1336 break; 1337 } 1338 } 1339 1340 /* The gen6 MATH instruction can't handle source modifiers or 1341 * unusual register regions, so avoid coalescing those for 1342 * now. We should do something more specific. 1343 */ 1344 if (intel->gen >= 6 && 1345 scan_inst->is_math() && 1346 (has_source_modifiers || inst->src[0].file == UNIFORM)) { 1347 interfered = true; 1348 break; 1349 } 1350 } 1351 if (interfered) { 1352 continue; 1353 } 1354 1355 /* Rewrite the later usage to point at the source of the move to 1356 * be removed. 
1357 */ 1358 for (fs_inst *scan_inst = inst; 1359 !scan_inst->is_tail_sentinel(); 1360 scan_inst = (fs_inst *)scan_inst->next) { 1361 for (int i = 0; i < 3; i++) { 1362 if (scan_inst->src[i].file == GRF && 1363 scan_inst->src[i].reg == inst->dst.reg && 1364 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 1365 fs_reg new_src = inst->src[0]; 1366 if (scan_inst->src[i].abs) { 1367 new_src.negate = 0; 1368 new_src.abs = 1; 1369 } 1370 new_src.negate ^= scan_inst->src[i].negate; 1371 scan_inst->src[i] = new_src; 1372 } 1373 } 1374 } 1375 1376 inst->remove(); 1377 progress = true; 1378 } 1379 1380 if (progress) 1381 live_intervals_valid = false; 1382 1383 return progress; 1384} 1385 1386 1387bool 1388fs_visitor::compute_to_mrf() 1389{ 1390 bool progress = false; 1391 int next_ip = 0; 1392 1393 calculate_live_intervals(); 1394 1395 foreach_list_safe(node, &this->instructions) { 1396 fs_inst *inst = (fs_inst *)node; 1397 1398 int ip = next_ip; 1399 next_ip++; 1400 1401 if (inst->opcode != BRW_OPCODE_MOV || 1402 inst->predicated || 1403 inst->dst.file != MRF || inst->src[0].file != GRF || 1404 inst->dst.type != inst->src[0].type || 1405 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 1406 continue; 1407 1408 /* Work out which hardware MRF registers are written by this 1409 * instruction. 1410 */ 1411 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4; 1412 int mrf_high; 1413 if (inst->dst.reg & BRW_MRF_COMPR4) { 1414 mrf_high = mrf_low + 4; 1415 } else if (c->dispatch_width == 16 && 1416 (!inst->force_uncompressed && !inst->force_sechalf)) { 1417 mrf_high = mrf_low + 1; 1418 } else { 1419 mrf_high = mrf_low; 1420 } 1421 1422 /* Can't compute-to-MRF this GRF if someone else was going to 1423 * read it later. 1424 */ 1425 if (this->virtual_grf_use[inst->src[0].reg] > ip) 1426 continue; 1427 1428 /* Found a move of a GRF to a MRF. Let's see if we can go 1429 * rewrite the thing that made this GRF to write into the MRF. 
1430 */ 1431 fs_inst *scan_inst; 1432 for (scan_inst = (fs_inst *)inst->prev; 1433 scan_inst->prev != NULL; 1434 scan_inst = (fs_inst *)scan_inst->prev) { 1435 if (scan_inst->dst.file == GRF && 1436 scan_inst->dst.reg == inst->src[0].reg) { 1437 /* Found the last thing to write our reg we want to turn 1438 * into a compute-to-MRF. 1439 */ 1440 1441 if (scan_inst->is_tex()) { 1442 /* texturing writes several continuous regs, so we can't 1443 * compute-to-mrf that. 1444 */ 1445 break; 1446 } 1447 1448 /* If it's predicated, it (probably) didn't populate all 1449 * the channels. We might be able to rewrite everything 1450 * that writes that reg, but it would require smarter 1451 * tracking to delay the rewriting until complete success. 1452 */ 1453 if (scan_inst->predicated) 1454 break; 1455 1456 /* If it's half of register setup and not the same half as 1457 * our MOV we're trying to remove, bail for now. 1458 */ 1459 if (scan_inst->force_uncompressed != inst->force_uncompressed || 1460 scan_inst->force_sechalf != inst->force_sechalf) { 1461 break; 1462 } 1463 1464 /* SEND instructions can't have MRF as a destination. */ 1465 if (scan_inst->mlen) 1466 break; 1467 1468 if (intel->gen >= 6) { 1469 /* gen6 math instructions must have the destination be 1470 * GRF, so no compute-to-MRF for them. 1471 */ 1472 if (scan_inst->is_math()) { 1473 break; 1474 } 1475 } 1476 1477 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1478 /* Found the creator of our MRF's source value. */ 1479 scan_inst->dst.file = MRF; 1480 scan_inst->dst.reg = inst->dst.reg; 1481 scan_inst->saturate |= inst->saturate; 1482 inst->remove(); 1483 progress = true; 1484 } 1485 break; 1486 } 1487 1488 /* We don't handle flow control here. Most computation of 1489 * values that end up in MRFs are shortly before the MRF 1490 * write anyway. 
1491 */ 1492 if (scan_inst->opcode == BRW_OPCODE_DO || 1493 scan_inst->opcode == BRW_OPCODE_WHILE || 1494 scan_inst->opcode == BRW_OPCODE_ELSE || 1495 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1496 break; 1497 } 1498 1499 /* You can't read from an MRF, so if someone else reads our 1500 * MRF's source GRF that we wanted to rewrite, that stops us. 1501 */ 1502 bool interfered = false; 1503 for (int i = 0; i < 3; i++) { 1504 if (scan_inst->src[i].file == GRF && 1505 scan_inst->src[i].reg == inst->src[0].reg && 1506 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 1507 interfered = true; 1508 } 1509 } 1510 if (interfered) 1511 break; 1512 1513 if (scan_inst->dst.file == MRF) { 1514 /* If somebody else writes our MRF here, we can't 1515 * compute-to-MRF before that. 1516 */ 1517 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4; 1518 int scan_mrf_high; 1519 1520 if (scan_inst->dst.reg & BRW_MRF_COMPR4) { 1521 scan_mrf_high = scan_mrf_low + 4; 1522 } else if (c->dispatch_width == 16 && 1523 (!scan_inst->force_uncompressed && 1524 !scan_inst->force_sechalf)) { 1525 scan_mrf_high = scan_mrf_low + 1; 1526 } else { 1527 scan_mrf_high = scan_mrf_low; 1528 } 1529 1530 if (mrf_low == scan_mrf_low || 1531 mrf_low == scan_mrf_high || 1532 mrf_high == scan_mrf_low || 1533 mrf_high == scan_mrf_high) { 1534 break; 1535 } 1536 } 1537 1538 if (scan_inst->mlen > 0) { 1539 /* Found a SEND instruction, which means that there are 1540 * live values in MRFs from base_mrf to base_mrf + 1541 * scan_inst->mlen - 1. Don't go pushing our MRF write up 1542 * above it. 1543 */ 1544 if (mrf_low >= scan_inst->base_mrf && 1545 mrf_low < scan_inst->base_mrf + scan_inst->mlen) { 1546 break; 1547 } 1548 if (mrf_high >= scan_inst->base_mrf && 1549 mrf_high < scan_inst->base_mrf + scan_inst->mlen) { 1550 break; 1551 } 1552 } 1553 } 1554 } 1555 1556 return progress; 1557} 1558 1559/** 1560 * Walks through basic blocks, locking for repeated MRF writes and 1561 * removing the later ones. 
1562 */ 1563bool 1564fs_visitor::remove_duplicate_mrf_writes() 1565{ 1566 fs_inst *last_mrf_move[16]; 1567 bool progress = false; 1568 1569 /* Need to update the MRF tracking for compressed instructions. */ 1570 if (c->dispatch_width == 16) 1571 return false; 1572 1573 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1574 1575 foreach_list_safe(node, &this->instructions) { 1576 fs_inst *inst = (fs_inst *)node; 1577 1578 switch (inst->opcode) { 1579 case BRW_OPCODE_DO: 1580 case BRW_OPCODE_WHILE: 1581 case BRW_OPCODE_IF: 1582 case BRW_OPCODE_ELSE: 1583 case BRW_OPCODE_ENDIF: 1584 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1585 continue; 1586 default: 1587 break; 1588 } 1589 1590 if (inst->opcode == BRW_OPCODE_MOV && 1591 inst->dst.file == MRF) { 1592 fs_inst *prev_inst = last_mrf_move[inst->dst.reg]; 1593 if (prev_inst && inst->equals(prev_inst)) { 1594 inst->remove(); 1595 progress = true; 1596 continue; 1597 } 1598 } 1599 1600 /* Clear out the last-write records for MRFs that were overwritten. */ 1601 if (inst->dst.file == MRF) { 1602 last_mrf_move[inst->dst.reg] = NULL; 1603 } 1604 1605 if (inst->mlen > 0) { 1606 /* Found a SEND instruction, which will include two or fewer 1607 * implied MRF writes. We could do better here. 1608 */ 1609 for (int i = 0; i < implied_mrf_writes(inst); i++) { 1610 last_mrf_move[inst->base_mrf + i] = NULL; 1611 } 1612 } 1613 1614 /* Clear out any MRF move records whose sources got overwritten. 
*/ 1615 if (inst->dst.file == GRF) { 1616 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 1617 if (last_mrf_move[i] && 1618 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 1619 last_mrf_move[i] = NULL; 1620 } 1621 } 1622 } 1623 1624 if (inst->opcode == BRW_OPCODE_MOV && 1625 inst->dst.file == MRF && 1626 inst->src[0].file == GRF && 1627 !inst->predicated) { 1628 last_mrf_move[inst->dst.reg] = inst; 1629 } 1630 } 1631 1632 return progress; 1633} 1634 1635bool 1636fs_visitor::virtual_grf_interferes(int a, int b) 1637{ 1638 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 1639 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 1640 1641 /* We can't handle dead register writes here, without iterating 1642 * over the whole instruction stream to find every single dead 1643 * write to that register to compare to the live interval of the 1644 * other register. Just assert that dead_code_eliminate() has been 1645 * called. 1646 */ 1647 assert((this->virtual_grf_use[a] != -1 || 1648 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 1649 (this->virtual_grf_use[b] != -1 || 1650 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 1651 1652 /* If the register is used to store 16 values of less than float 1653 * size (only the case for pixel_[xy]), then we can't allocate 1654 * another dword-sized thing to that register that would be used in 1655 * the same instruction. This is because when the GPU decodes (for 1656 * example): 1657 * 1658 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 1659 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 1660 * 1661 * it's actually processed as: 1662 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 1663 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 1664 * 1665 * so our second half values in g6 got overwritten in the first 1666 * half. 
1667 */ 1668 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 1669 this->pixel_x.reg == b || 1670 this->pixel_y.reg == a || 1671 this->pixel_y.reg == b)) { 1672 return start <= end; 1673 } 1674 1675 return start < end; 1676} 1677 1678bool 1679fs_visitor::run() 1680{ 1681 uint32_t prog_offset_16 = 0; 1682 uint32_t orig_nr_params = c->prog_data.nr_params; 1683 1684 brw_wm_payload_setup(brw, c); 1685 1686 if (c->dispatch_width == 16) { 1687 /* align to 64 byte boundary. */ 1688 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { 1689 brw_NOP(p); 1690 } 1691 1692 /* Save off the start of this 16-wide program in case we succeed. */ 1693 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); 1694 1695 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 1696 } 1697 1698 if (0) { 1699 emit_dummy_fs(); 1700 } else { 1701 calculate_urb_setup(); 1702 if (intel->gen < 6) 1703 emit_interpolation_setup_gen4(); 1704 else 1705 emit_interpolation_setup_gen6(); 1706 1707 /* Generate FS IR for main(). (the visitor only descends into 1708 * functions called "main"). 
1709 */ 1710 foreach_list(node, &*shader->ir) { 1711 ir_instruction *ir = (ir_instruction *)node; 1712 base_ir = ir; 1713 this->result = reg_undef; 1714 ir->accept(this); 1715 } 1716 if (failed) 1717 return false; 1718 1719 emit_fb_writes(); 1720 1721 split_virtual_grfs(); 1722 1723 setup_paramvalues_refs(); 1724 setup_pull_constants(); 1725 1726 bool progress; 1727 do { 1728 progress = false; 1729 1730 progress = remove_duplicate_mrf_writes() || progress; 1731 1732 progress = propagate_constants() || progress; 1733 progress = opt_algebraic() || progress; 1734 progress = register_coalesce() || progress; 1735 progress = compute_to_mrf() || progress; 1736 progress = dead_code_eliminate() || progress; 1737 } while (progress); 1738 1739 remove_dead_constants(); 1740 1741 schedule_instructions(); 1742 1743 assign_curb_setup(); 1744 assign_urb_setup(); 1745 1746 if (0) { 1747 /* Debug of register spilling: Go spill everything. */ 1748 int virtual_grf_count = virtual_grf_next; 1749 for (int i = 0; i < virtual_grf_count; i++) { 1750 spill_reg(i); 1751 } 1752 } 1753 1754 if (0) 1755 assign_regs_trivial(); 1756 else { 1757 while (!assign_regs()) { 1758 if (failed) 1759 break; 1760 } 1761 } 1762 } 1763 assert(force_uncompressed_stack == 0); 1764 assert(force_sechalf_stack == 0); 1765 1766 if (failed) 1767 return false; 1768 1769 generate_code(); 1770 1771 if (c->dispatch_width == 8) { 1772 c->prog_data.reg_blocks = brw_register_blocks(grf_used); 1773 } else { 1774 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); 1775 c->prog_data.prog_offset_16 = prog_offset_16; 1776 1777 /* Make sure we didn't try to sneak in an extra uniform */ 1778 assert(orig_nr_params == c->prog_data.nr_params); 1779 (void) orig_nr_params; 1780 } 1781 1782 return !failed; 1783} 1784 1785bool 1786brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, 1787 struct gl_shader_program *prog) 1788{ 1789 struct intel_context *intel = &brw->intel; 1790 1791 if (!prog) 1792 return false; 1793 
1794 struct brw_shader *shader = 1795 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 1796 if (!shader) 1797 return false; 1798 1799 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1800 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 1801 _mesa_print_ir(shader->ir, NULL); 1802 printf("\n\n"); 1803 } 1804 1805 /* Now the main event: Visit the shader IR and generate our FS IR for it. 1806 */ 1807 c->dispatch_width = 8; 1808 1809 fs_visitor v(c, prog, shader); 1810 if (!v.run()) { 1811 prog->LinkStatus = false; 1812 ralloc_strcat(&prog->InfoLog, v.fail_msg); 1813 1814 return false; 1815 } 1816 1817 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { 1818 c->dispatch_width = 16; 1819 fs_visitor v2(c, prog, shader); 1820 v2.import_uniforms(&v); 1821 v2.run(); 1822 } 1823 1824 c->prog_data.dispatch_width = 8; 1825 1826 return true; 1827} 1828 1829bool 1830brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) 1831{ 1832 struct brw_context *brw = brw_context(ctx); 1833 struct brw_wm_prog_key key; 1834 1835 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT]) 1836 return true; 1837 1838 struct gl_fragment_program *fp = (struct gl_fragment_program *) 1839 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program; 1840 struct brw_fragment_program *bfp = brw_fragment_program(fp); 1841 1842 memset(&key, 0, sizeof(key)); 1843 1844 if (fp->UsesKill) 1845 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; 1846 1847 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 1848 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; 1849 1850 /* Just assume depth testing. 
*/ 1851 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; 1852 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; 1853 1854 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); 1855 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { 1856 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) 1857 continue; 1858 1859 key.proj_attrib_mask |= 1 << i; 1860 1861 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); 1862 1863 if (vp_index >= 0) 1864 key.vp_outputs_written |= BITFIELD64_BIT(vp_index); 1865 } 1866 1867 key.clamp_fragment_color = true; 1868 1869 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) { 1870 if (fp->Base.ShadowSamplers & (1 << i)) 1871 key.compare_funcs[i] = GL_LESS; 1872 1873 /* FINISHME: depth compares might use (0,0,0,W) for example */ 1874 key.tex_swizzles[i] = SWIZZLE_XYZW; 1875 } 1876 1877 if (fp->Base.InputsRead & FRAG_BIT_WPOS) { 1878 key.drawable_height = ctx->DrawBuffer->Height; 1879 key.render_to_fbo = ctx->DrawBuffer->Name != 0; 1880 } 1881 1882 key.nr_color_regions = 1; 1883 1884 key.program_string_id = bfp->id; 1885 1886 uint32_t old_prog_offset = brw->wm.prog_offset; 1887 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; 1888 1889 bool success = do_wm_prog(brw, prog, bfp, &key); 1890 1891 brw->wm.prog_offset = old_prog_offset; 1892 brw->wm.prog_data = old_prog_data; 1893 1894 return success; 1895} 1896