brw_fs.cpp revision b76378d46a211521582cfab56dc05031a57502a6
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs.cpp 25 * 26 * This file drives the GLSL IR -> LIR translation, contains the 27 * optimizations on the LIR, and drives the generation of native code 28 * from the LIR. 29 */ 30 31extern "C" { 32 33#include <sys/types.h> 34 35#include "main/macros.h" 36#include "main/shaderobj.h" 37#include "main/uniforms.h" 38#include "program/prog_parameter.h" 39#include "program/prog_print.h" 40#include "program/register_allocate.h" 41#include "program/sampler.h" 42#include "program/hash_table.h" 43#include "brw_context.h" 44#include "brw_eu.h" 45#include "brw_wm.h" 46} 47#include "brw_shader.h" 48#include "brw_fs.h" 49#include "../glsl/glsl_types.h" 50#include "../glsl/ir_print_visitor.h" 51 52#define MAX_INSTRUCTION (1 << 30) 53 54int 55fs_visitor::type_size(const struct glsl_type *type) 56{ 57 unsigned int size, i; 58 59 switch (type->base_type) { 60 case GLSL_TYPE_UINT: 61 case GLSL_TYPE_INT: 62 case GLSL_TYPE_FLOAT: 63 case GLSL_TYPE_BOOL: 64 return type->components(); 65 case GLSL_TYPE_ARRAY: 66 return type_size(type->fields.array) * type->length; 67 case GLSL_TYPE_STRUCT: 68 size = 0; 69 for (i = 0; i < type->length; i++) { 70 size += type_size(type->fields.structure[i].type); 71 } 72 return size; 73 case GLSL_TYPE_SAMPLER: 74 /* Samplers take up no register space, since they're baked in at 75 * link time. 76 */ 77 return 0; 78 default: 79 assert(!"not reached"); 80 return 0; 81 } 82} 83 84void 85fs_visitor::fail(const char *format, ...) 86{ 87 va_list va; 88 char *msg; 89 90 if (failed) 91 return; 92 93 failed = true; 94 95 va_start(va, format); 96 msg = ralloc_vasprintf(mem_ctx, format, va); 97 va_end(va); 98 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); 99 100 this->fail_msg = msg; 101 102 if (INTEL_DEBUG & DEBUG_WM) { 103 fprintf(stderr, "%s", msg); 104 } 105} 106 107void 108fs_visitor::push_force_uncompressed() 109{ 110 force_uncompressed_stack++; 111} 112 113void 114fs_visitor::pop_force_uncompressed() 115{ 116 force_uncompressed_stack--; 117 assert(force_uncompressed_stack >= 0); 118} 119 120void 121fs_visitor::push_force_sechalf() 122{ 123 force_sechalf_stack++; 124} 125 126void 127fs_visitor::pop_force_sechalf() 128{ 129 force_sechalf_stack--; 130 assert(force_sechalf_stack >= 0); 131} 132 133/** 134 * Returns how many MRFs an FS opcode will write over. 135 * 136 * Note that this is not the 0 or 1 implied writes in an actual gen 137 * instruction -- the FS opcodes often generate MOVs in addition. 138 */ 139int 140fs_visitor::implied_mrf_writes(fs_inst *inst) 141{ 142 if (inst->mlen == 0) 143 return 0; 144 145 switch (inst->opcode) { 146 case FS_OPCODE_RCP: 147 case FS_OPCODE_RSQ: 148 case FS_OPCODE_SQRT: 149 case FS_OPCODE_EXP2: 150 case FS_OPCODE_LOG2: 151 case FS_OPCODE_SIN: 152 case FS_OPCODE_COS: 153 return 1 * c->dispatch_width / 8; 154 case FS_OPCODE_POW: 155 return 2 * c->dispatch_width / 8; 156 case FS_OPCODE_TEX: 157 case FS_OPCODE_TXB: 158 case FS_OPCODE_TXD: 159 case FS_OPCODE_TXL: 160 return 1; 161 case FS_OPCODE_FB_WRITE: 162 return 2; 163 case FS_OPCODE_PULL_CONSTANT_LOAD: 164 case FS_OPCODE_UNSPILL: 165 return 1; 166 case FS_OPCODE_SPILL: 167 return 2; 168 default: 169 assert(!"not reached"); 170 return inst->mlen; 171 } 172} 173 174int 175fs_visitor::virtual_grf_alloc(int size) 176{ 177 if (virtual_grf_array_size <= virtual_grf_next) { 178 if (virtual_grf_array_size == 0) 179 virtual_grf_array_size = 16; 180 else 181 virtual_grf_array_size *= 2; 182 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 183 virtual_grf_array_size); 184 } 185 virtual_grf_sizes[virtual_grf_next] = size; 186 return virtual_grf_next++; 187} 188 189/** Fixed HW reg constructor. */ 190fs_reg::fs_reg(enum register_file file, int hw_reg) 191{ 192 init(); 193 this->file = file; 194 this->hw_reg = hw_reg; 195 this->type = BRW_REGISTER_TYPE_F; 196} 197 198/** Fixed HW reg constructor. */ 199fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 200{ 201 init(); 202 this->file = file; 203 this->hw_reg = hw_reg; 204 this->type = type; 205} 206 207/** Automatic reg constructor. */ 208fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 209{ 210 init(); 211 212 this->file = GRF; 213 this->reg = v->virtual_grf_alloc(v->type_size(type)); 214 this->reg_offset = 0; 215 this->type = brw_type_for_base_type(type); 216} 217 218fs_reg * 219fs_visitor::variable_storage(ir_variable *var) 220{ 221 return (fs_reg *)hash_table_find(this->variable_ht, var); 222} 223 224void 225import_uniforms_callback(const void *key, 226 void *data, 227 void *closure) 228{ 229 struct hash_table *dst_ht = (struct hash_table *)closure; 230 const fs_reg *reg = (const fs_reg *)data; 231 232 if (reg->file != UNIFORM) 233 return; 234 235 hash_table_insert(dst_ht, data, key); 236} 237 238/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 239 * This brings in those uniform definitions 240 */ 241void 242fs_visitor::import_uniforms(fs_visitor *v) 243{ 244 hash_table_call_foreach(v->variable_ht, 245 import_uniforms_callback, 246 variable_ht); 247 this->params_remap = v->params_remap; 248} 249 250/* Our support for uniforms is piggy-backed on the struct 251 * gl_fragment_program, because that's where the values actually 252 * get stored, rather than in some global gl_shader_program uniform 253 * store. 254 */ 255int 256fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 257{ 258 unsigned int offset = 0; 259 260 if (type->is_matrix()) { 261 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 262 type->vector_elements, 263 1); 264 265 for (unsigned int i = 0; i < type->matrix_columns; i++) { 266 offset += setup_uniform_values(loc + offset, column); 267 } 268 269 return offset; 270 } 271 272 switch (type->base_type) { 273 case GLSL_TYPE_FLOAT: 274 case GLSL_TYPE_UINT: 275 case GLSL_TYPE_INT: 276 case GLSL_TYPE_BOOL: 277 for (unsigned int i = 0; i < type->vector_elements; i++) { 278 unsigned int param = c->prog_data.nr_params++; 279 280 assert(param < ARRAY_SIZE(c->prog_data.param)); 281 282 switch (type->base_type) { 283 case GLSL_TYPE_FLOAT: 284 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 285 break; 286 case GLSL_TYPE_UINT: 287 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 288 break; 289 case GLSL_TYPE_INT: 290 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 291 break; 292 case GLSL_TYPE_BOOL: 293 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 294 break; 295 default: 296 assert(!"not reached"); 297 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 298 break; 299 } 300 this->param_index[param] = loc; 301 this->param_offset[param] = i; 302 } 303 return 1; 304 305 case GLSL_TYPE_STRUCT: 306 for (unsigned int i = 0; i < type->length; i++) { 307 offset += setup_uniform_values(loc + offset, 308 type->fields.structure[i].type); 309 } 310 return offset; 311 312 case GLSL_TYPE_ARRAY: 313 for (unsigned int i = 0; i < type->length; i++) { 314 offset += setup_uniform_values(loc + offset, type->fields.array); 315 } 316 return offset; 317 318 case GLSL_TYPE_SAMPLER: 319 /* The sampler takes up a slot, but we don't use any values from it. */ 320 return 1; 321 322 default: 323 assert(!"not reached"); 324 return 0; 325 } 326} 327 328 329/* Our support for builtin uniforms is even scarier than non-builtin. 330 * It sits on top of the PROG_STATE_VAR parameters that are 331 * automatically updated from GL context state. 332 */ 333void 334fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 335{ 336 const ir_state_slot *const slots = ir->state_slots; 337 assert(ir->state_slots != NULL); 338 339 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 340 /* This state reference has already been setup by ir_to_mesa, but we'll 341 * get the same index back here. 342 */ 343 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 344 (gl_state_index *)slots[i].tokens); 345 346 /* Add each of the unique swizzles of the element as a parameter. 347 * This'll end up matching the expected layout of the 348 * array/matrix/structure we're trying to fill in. 349 */ 350 int last_swiz = -1; 351 for (unsigned int j = 0; j < 4; j++) { 352 int swiz = GET_SWZ(slots[i].swizzle, j); 353 if (swiz == last_swiz) 354 break; 355 last_swiz = swiz; 356 357 c->prog_data.param_convert[c->prog_data.nr_params] = 358 PARAM_NO_CONVERT; 359 this->param_index[c->prog_data.nr_params] = index; 360 this->param_offset[c->prog_data.nr_params] = swiz; 361 c->prog_data.nr_params++; 362 } 363 } 364} 365 366fs_reg * 367fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 368{ 369 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 370 fs_reg wpos = *reg; 371 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 372 373 /* gl_FragCoord.x */ 374 if (ir->pixel_center_integer) { 375 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 376 } else { 377 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 378 } 379 wpos.reg_offset++; 380 381 /* gl_FragCoord.y */ 382 if (!flip && ir->pixel_center_integer) { 383 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 384 } else { 385 fs_reg pixel_y = this->pixel_y; 386 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 387 388 if (flip) { 389 pixel_y.negate = true; 390 offset += c->key.drawable_height - 1.0; 391 } 392 393 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 394 } 395 wpos.reg_offset++; 396 397 /* gl_FragCoord.z */ 398 if (intel->gen >= 6) { 399 emit(BRW_OPCODE_MOV, wpos, 400 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 401 } else { 402 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 403 interp_reg(FRAG_ATTRIB_WPOS, 2)); 404 } 405 wpos.reg_offset++; 406 407 /* gl_FragCoord.w: Already set up in emit_interpolation */ 408 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 409 410 return reg; 411} 412 413fs_reg * 414fs_visitor::emit_general_interpolation(ir_variable *ir) 415{ 416 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 417 /* Interpolation is always in floating point regs. */ 418 reg->type = BRW_REGISTER_TYPE_F; 419 fs_reg attr = *reg; 420 421 unsigned int array_elements; 422 const glsl_type *type; 423 424 if (ir->type->is_array()) { 425 array_elements = ir->type->length; 426 if (array_elements == 0) { 427 fail("dereferenced array '%s' has length 0\n", ir->name); 428 } 429 type = ir->type->fields.array; 430 } else { 431 array_elements = 1; 432 type = ir->type; 433 } 434 435 int location = ir->location; 436 for (unsigned int i = 0; i < array_elements; i++) { 437 for (unsigned int j = 0; j < type->matrix_columns; j++) { 438 if (urb_setup[location] == -1) { 439 /* If there's no incoming setup data for this slot, don't 440 * emit interpolation for it. 441 */ 442 attr.reg_offset += type->vector_elements; 443 location++; 444 continue; 445 } 446 447 bool is_gl_Color = 448 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 449 450 if (c->key.flat_shade && is_gl_Color) { 451 /* Constant interpolation (flat shading) case. The SF has 452 * handed us defined values in only the constant offset 453 * field of the setup reg. 454 */ 455 for (unsigned int k = 0; k < type->vector_elements; k++) { 456 struct brw_reg interp = interp_reg(location, k); 457 interp = suboffset(interp, 3); 458 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 459 attr.reg_offset++; 460 } 461 } else { 462 /* Perspective interpolation case. */ 463 for (unsigned int k = 0; k < type->vector_elements; k++) { 464 /* FINISHME: At some point we probably want to push 465 * this farther by giving similar treatment to the 466 * other potentially constant components of the 467 * attribute, as well as making brw_vs_constval.c 468 * handle varyings other than gl_TexCoord. 469 */ 470 if (location >= FRAG_ATTRIB_TEX0 && 471 location <= FRAG_ATTRIB_TEX7 && 472 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) { 473 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f)); 474 } else { 475 struct brw_reg interp = interp_reg(location, k); 476 emit(FS_OPCODE_LINTERP, attr, 477 this->delta_x, this->delta_y, fs_reg(interp)); 478 } 479 attr.reg_offset++; 480 } 481 482 if (intel->gen < 6) { 483 attr.reg_offset -= type->vector_elements; 484 for (unsigned int k = 0; k < type->vector_elements; k++) { 485 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 486 attr.reg_offset++; 487 } 488 } 489 } 490 location++; 491 } 492 } 493 494 return reg; 495} 496 497fs_reg * 498fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 499{ 500 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 501 502 /* The frontfacing comes in as a bit in the thread payload. */ 503 if (intel->gen >= 6) { 504 emit(BRW_OPCODE_ASR, *reg, 505 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 506 fs_reg(15)); 507 emit(BRW_OPCODE_NOT, *reg, *reg); 508 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 509 } else { 510 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 511 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 512 * us front face 513 */ 514 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 515 fs_reg(r1_6ud), 516 fs_reg(1u << 31)); 517 inst->conditional_mod = BRW_CONDITIONAL_L; 518 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 519 } 520 521 return reg; 522} 523 524fs_inst * 525fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 526{ 527 switch (opcode) { 528 case FS_OPCODE_RCP: 529 case FS_OPCODE_RSQ: 530 case FS_OPCODE_SQRT: 531 case FS_OPCODE_EXP2: 532 case FS_OPCODE_LOG2: 533 case FS_OPCODE_SIN: 534 case FS_OPCODE_COS: 535 break; 536 default: 537 assert(!"not reached: bad math opcode"); 538 return NULL; 539 } 540 541 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 542 * might be able to do better by doing execsize = 1 math and then 543 * expanding that result out, but we would need to be careful with 544 * masking. 545 * 546 * The hardware ignores source modifiers (negate and abs) on math 547 * instructions, so we also move to a temp to set those up. 548 */ 549 if (intel->gen >= 6 && (src.file == UNIFORM || 550 src.abs || 551 src.negate)) { 552 fs_reg expanded = fs_reg(this, glsl_type::float_type); 553 emit(BRW_OPCODE_MOV, expanded, src); 554 src = expanded; 555 } 556 557 fs_inst *inst = emit(opcode, dst, src); 558 559 if (intel->gen < 6) { 560 inst->base_mrf = 2; 561 inst->mlen = c->dispatch_width / 8; 562 } 563 564 return inst; 565} 566 567fs_inst * 568fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 569{ 570 int base_mrf = 2; 571 fs_inst *inst; 572 573 assert(opcode == FS_OPCODE_POW); 574 575 if (intel->gen >= 6) { 576 /* Can't do hstride == 0 args to gen6 math, so expand it out. 577 * 578 * The hardware ignores source modifiers (negate and abs) on math 579 * instructions, so we also move to a temp to set those up. 580 */ 581 if (src0.file == UNIFORM || src0.abs || src0.negate) { 582 fs_reg expanded = fs_reg(this, glsl_type::float_type); 583 emit(BRW_OPCODE_MOV, expanded, src0); 584 src0 = expanded; 585 } 586 587 if (src1.file == UNIFORM || src1.abs || src1.negate) { 588 fs_reg expanded = fs_reg(this, glsl_type::float_type); 589 emit(BRW_OPCODE_MOV, expanded, src1); 590 src1 = expanded; 591 } 592 593 inst = emit(opcode, dst, src0, src1); 594 } else { 595 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 596 inst = emit(opcode, dst, src0, reg_null_f); 597 598 inst->base_mrf = base_mrf; 599 inst->mlen = 2 * c->dispatch_width / 8; 600 } 601 return inst; 602} 603 604/** 605 * To be called after the last _mesa_add_state_reference() call, to 606 * set up prog_data.param[] for assign_curb_setup() and 607 * setup_pull_constants(). 608 */ 609void 610fs_visitor::setup_paramvalues_refs() 611{ 612 if (c->dispatch_width != 8) 613 return; 614 615 /* Set up the pointers to ParamValues now that that array is finalized. */ 616 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 617 c->prog_data.param[i] = 618 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] + 619 this->param_offset[i]; 620 } 621} 622 623void 624fs_visitor::assign_curb_setup() 625{ 626 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 627 if (c->dispatch_width == 8) { 628 c->prog_data.first_curbe_grf = c->nr_payload_regs; 629 } else { 630 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 631 } 632 633 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 634 foreach_list(node, &this->instructions) { 635 fs_inst *inst = (fs_inst *)node; 636 637 for (unsigned int i = 0; i < 3; i++) { 638 if (inst->src[i].file == UNIFORM) { 639 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 640 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 641 constant_nr / 8, 642 constant_nr % 8); 643 644 inst->src[i].file = FIXED_HW_REG; 645 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 646 } 647 } 648 } 649} 650 651void 652fs_visitor::calculate_urb_setup() 653{ 654 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 655 urb_setup[i] = -1; 656 } 657 658 int urb_next = 0; 659 /* Figure out where each of the incoming setup attributes lands. */ 660 if (intel->gen >= 6) { 661 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 662 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { 663 urb_setup[i] = urb_next++; 664 } 665 } 666 } else { 667 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 668 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 669 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 670 int fp_index; 671 672 if (i >= VERT_RESULT_VAR0) 673 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 674 else if (i <= VERT_RESULT_TEX7) 675 fp_index = i; 676 else 677 fp_index = -1; 678 679 if (fp_index >= 0) 680 urb_setup[fp_index] = urb_next++; 681 } 682 } 683 } 684 685 /* Each attribute is 4 setup channels, each of which is half a reg. */ 686 c->prog_data.urb_read_length = urb_next * 2; 687} 688 689void 690fs_visitor::assign_urb_setup() 691{ 692 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 693 694 /* Offset all the urb_setup[] index by the actual position of the 695 * setup regs, now that the location of the constants has been chosen. 696 */ 697 foreach_list(node, &this->instructions) { 698 fs_inst *inst = (fs_inst *)node; 699 700 if (inst->opcode == FS_OPCODE_LINTERP) { 701 assert(inst->src[2].file == FIXED_HW_REG); 702 inst->src[2].fixed_hw_reg.nr += urb_start; 703 } 704 705 if (inst->opcode == FS_OPCODE_CINTERP) { 706 assert(inst->src[0].file == FIXED_HW_REG); 707 inst->src[0].fixed_hw_reg.nr += urb_start; 708 } 709 } 710 711 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 712} 713 714/** 715 * Split large virtual GRFs into separate components if we can. 716 * 717 * This is mostly duplicated with what brw_fs_vector_splitting does, 718 * but that's really conservative because it's afraid of doing 719 * splitting that doesn't result in real progress after the rest of 720 * the optimization phases, which would cause infinite looping in 721 * optimization. We can do it once here, safely. This also has the 722 * opportunity to split interpolated values, or maybe even uniforms, 723 * which we don't have at the IR level. 724 * 725 * We want to split, because virtual GRFs are what we register 726 * allocate and spill (due to contiguousness requirements for some 727 * instructions), and they're what we naturally generate in the 728 * codegen process, but most virtual GRFs don't actually need to be 729 * contiguous sets of GRFs. If we split, we'll end up with reduced 730 * live intervals and better dead code elimination and coalescing. 731 */ 732void 733fs_visitor::split_virtual_grfs() 734{ 735 int num_vars = this->virtual_grf_next; 736 bool split_grf[num_vars]; 737 int new_virtual_grf[num_vars]; 738 739 /* Try to split anything > 0 sized. */ 740 for (int i = 0; i < num_vars; i++) { 741 if (this->virtual_grf_sizes[i] != 1) 742 split_grf[i] = true; 743 else 744 split_grf[i] = false; 745 } 746 747 if (brw->has_pln) { 748 /* PLN opcodes rely on the delta_xy being contiguous. */ 749 split_grf[this->delta_x.reg] = false; 750 } 751 752 foreach_list(node, &this->instructions) { 753 fs_inst *inst = (fs_inst *)node; 754 755 /* Texturing produces 4 contiguous registers, so no splitting. */ 756 if (inst->is_tex()) { 757 split_grf[inst->dst.reg] = false; 758 } 759 } 760 761 /* Allocate new space for split regs. Note that the virtual 762 * numbers will be contiguous. 763 */ 764 for (int i = 0; i < num_vars; i++) { 765 if (split_grf[i]) { 766 new_virtual_grf[i] = virtual_grf_alloc(1); 767 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 768 int reg = virtual_grf_alloc(1); 769 assert(reg == new_virtual_grf[i] + j - 1); 770 (void) reg; 771 } 772 this->virtual_grf_sizes[i] = 1; 773 } 774 } 775 776 foreach_list(node, &this->instructions) { 777 fs_inst *inst = (fs_inst *)node; 778 779 if (inst->dst.file == GRF && 780 split_grf[inst->dst.reg] && 781 inst->dst.reg_offset != 0) { 782 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 783 inst->dst.reg_offset - 1); 784 inst->dst.reg_offset = 0; 785 } 786 for (int i = 0; i < 3; i++) { 787 if (inst->src[i].file == GRF && 788 split_grf[inst->src[i].reg] && 789 inst->src[i].reg_offset != 0) { 790 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 791 inst->src[i].reg_offset - 1); 792 inst->src[i].reg_offset = 0; 793 } 794 } 795 } 796 this->live_intervals_valid = false; 797} 798 799bool 800fs_visitor::remove_dead_constants() 801{ 802 if (c->dispatch_width == 8) { 803 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params); 804 805 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) 806 this->params_remap[i] = -1; 807 808 /* Find which params are still in use. */ 809 foreach_list(node, &this->instructions) { 810 fs_inst *inst = (fs_inst *)node; 811 812 for (int i = 0; i < 3; i++) { 813 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 814 815 if (inst->src[i].file != UNIFORM) 816 continue; 817 818 assert(constant_nr < (int)c->prog_data.nr_params); 819 820 /* For now, set this to non-negative. We'll give it the 821 * actual new number in a moment, in order to keep the 822 * register numbers nicely ordered. 823 */ 824 this->params_remap[constant_nr] = 0; 825 } 826 } 827 828 /* Figure out what the new numbers for the params will be. At some 829 * point when we're doing uniform array access, we're going to want 830 * to keep the distinction between .reg and .reg_offset, but for 831 * now we don't care. 832 */ 833 unsigned int new_nr_params = 0; 834 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 835 if (this->params_remap[i] != -1) { 836 this->params_remap[i] = new_nr_params++; 837 } 838 } 839 840 /* Update the list of params to be uploaded to match our new numbering. */ 841 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 842 int remapped = this->params_remap[i]; 843 844 if (remapped == -1) 845 continue; 846 847 /* We've already done setup_paramvalues_refs() so no need to worry 848 * about param_index and param_offset. 849 */ 850 c->prog_data.param[remapped] = c->prog_data.param[i]; 851 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i]; 852 } 853 854 c->prog_data.nr_params = new_nr_params; 855 } else { 856 /* This should have been generated in the 8-wide pass already. */ 857 assert(this->params_remap); 858 } 859 860 /* Now do the renumbering of the shader to remove unused params. */ 861 foreach_list(node, &this->instructions) { 862 fs_inst *inst = (fs_inst *)node; 863 864 for (int i = 0; i < 3; i++) { 865 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 866 867 if (inst->src[i].file != UNIFORM) 868 continue; 869 870 assert(this->params_remap[constant_nr] != -1); 871 inst->src[i].hw_reg = this->params_remap[constant_nr]; 872 inst->src[i].reg_offset = 0; 873 } 874 } 875 876 return true; 877} 878 879/** 880 * Choose accesses from the UNIFORM file to demote to using the pull 881 * constant buffer. 882 * 883 * We allow a fragment shader to have more than the specified minimum 884 * maximum number of fragment shader uniform components (64). If 885 * there are too many of these, they'd fill up all of register space. 886 * So, this will push some of them out to the pull constant buffer and 887 * update the program to load them. 888 */ 889void 890fs_visitor::setup_pull_constants() 891{ 892 /* Only allow 16 registers (128 uniform components) as push constants. */ 893 unsigned int max_uniform_components = 16 * 8; 894 if (c->prog_data.nr_params <= max_uniform_components) 895 return; 896 897 if (c->dispatch_width == 16) { 898 fail("Pull constants not supported in 16-wide\n"); 899 return; 900 } 901 902 /* Just demote the end of the list. We could probably do better 903 * here, demoting things that are rarely used in the program first. 904 */ 905 int pull_uniform_base = max_uniform_components; 906 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 907 908 foreach_list(node, &this->instructions) { 909 fs_inst *inst = (fs_inst *)node; 910 911 for (int i = 0; i < 3; i++) { 912 if (inst->src[i].file != UNIFORM) 913 continue; 914 915 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 916 if (uniform_nr < pull_uniform_base) 917 continue; 918 919 fs_reg dst = fs_reg(this, glsl_type::float_type); 920 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 921 dst); 922 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 923 pull->ir = inst->ir; 924 pull->annotation = inst->annotation; 925 pull->base_mrf = 14; 926 pull->mlen = 1; 927 928 inst->insert_before(pull); 929 930 inst->src[i].file = GRF; 931 inst->src[i].reg = dst.reg; 932 inst->src[i].reg_offset = 0; 933 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 934 } 935 } 936 937 for (int i = 0; i < pull_uniform_count; i++) { 938 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 939 c->prog_data.pull_param_convert[i] = 940 c->prog_data.param_convert[pull_uniform_base + i]; 941 } 942 c->prog_data.nr_params -= pull_uniform_count; 943 c->prog_data.nr_pull_params = pull_uniform_count; 944} 945 946void 947fs_visitor::calculate_live_intervals() 948{ 949 int num_vars = this->virtual_grf_next; 950 int *def = ralloc_array(mem_ctx, int, num_vars); 951 int *use = ralloc_array(mem_ctx, int, num_vars); 952 int loop_depth = 0; 953 int loop_start = 0; 954 955 if (this->live_intervals_valid) 956 return; 957 958 for (int i = 0; i < num_vars; i++) { 959 def[i] = MAX_INSTRUCTION; 960 use[i] = -1; 961 } 962 963 int ip = 0; 964 foreach_list(node, &this->instructions) { 965 fs_inst *inst = (fs_inst *)node; 966 967 if (inst->opcode == BRW_OPCODE_DO) { 968 if (loop_depth++ == 0) 969 loop_start = ip; 970 } else if (inst->opcode == BRW_OPCODE_WHILE) { 971 loop_depth--; 972 973 if (loop_depth == 0) { 974 /* Patches up the use of vars marked for being live across 975 * the whole loop. 976 */ 977 for (int i = 0; i < num_vars; i++) { 978 if (use[i] == loop_start) { 979 use[i] = ip; 980 } 981 } 982 } 983 } else { 984 for (unsigned int i = 0; i < 3; i++) { 985 if (inst->src[i].file == GRF) { 986 int reg = inst->src[i].reg; 987 988 if (!loop_depth) { 989 use[reg] = ip; 990 } else { 991 def[reg] = MIN2(loop_start, def[reg]); 992 use[reg] = loop_start; 993 994 /* Nobody else is going to go smash our start to 995 * later in the loop now, because def[reg] now 996 * points before the bb header. 997 */ 998 } 999 } 1000 } 1001 if (inst->dst.file == GRF) { 1002 int reg = inst->dst.reg; 1003 1004 if (!loop_depth) { 1005 def[reg] = MIN2(def[reg], ip); 1006 } else { 1007 def[reg] = MIN2(def[reg], loop_start); 1008 } 1009 } 1010 } 1011 1012 ip++; 1013 } 1014 1015 ralloc_free(this->virtual_grf_def); 1016 ralloc_free(this->virtual_grf_use); 1017 this->virtual_grf_def = def; 1018 this->virtual_grf_use = use; 1019 1020 this->live_intervals_valid = true; 1021} 1022 1023/** 1024 * Attempts to move immediate constants into the immediate 1025 * constant slot of following instructions. 1026 * 1027 * Immediate constants are a bit tricky -- they have to be in the last 1028 * operand slot, you can't do abs/negate on them, 1029 */ 1030 1031bool 1032fs_visitor::propagate_constants() 1033{ 1034 bool progress = false; 1035 1036 calculate_live_intervals(); 1037 1038 foreach_list(node, &this->instructions) { 1039 fs_inst *inst = (fs_inst *)node; 1040 1041 if (inst->opcode != BRW_OPCODE_MOV || 1042 inst->predicated || 1043 inst->dst.file != GRF || inst->src[0].file != IMM || 1044 inst->dst.type != inst->src[0].type || 1045 (c->dispatch_width == 16 && 1046 (inst->force_uncompressed || inst->force_sechalf))) 1047 continue; 1048 1049 /* Don't bother with cases where we should have had the 1050 * operation on the constant folded in GLSL already. 1051 */ 1052 if (inst->saturate) 1053 continue; 1054 1055 /* Found a move of a constant to a GRF. Find anything else using the GRF 1056 * before it's written, and replace it with the constant if we can. 1057 */ 1058 for (fs_inst *scan_inst = (fs_inst *)inst->next; 1059 !scan_inst->is_tail_sentinel(); 1060 scan_inst = (fs_inst *)scan_inst->next) { 1061 if (scan_inst->opcode == BRW_OPCODE_DO || 1062 scan_inst->opcode == BRW_OPCODE_WHILE || 1063 scan_inst->opcode == BRW_OPCODE_ELSE || 1064 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1065 break; 1066 } 1067 1068 for (int i = 2; i >= 0; i--) { 1069 if (scan_inst->src[i].file != GRF || 1070 scan_inst->src[i].reg != inst->dst.reg || 1071 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 1072 continue; 1073 1074 /* Don't bother with cases where we should have had the 1075 * operation on the constant folded in GLSL already. 1076 */ 1077 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 1078 continue; 1079 1080 switch (scan_inst->opcode) { 1081 case BRW_OPCODE_MOV: 1082 scan_inst->src[i] = inst->src[0]; 1083 progress = true; 1084 break; 1085 1086 case BRW_OPCODE_MUL: 1087 case BRW_OPCODE_ADD: 1088 if (i == 1) { 1089 scan_inst->src[i] = inst->src[0]; 1090 progress = true; 1091 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1092 /* Fit this constant in by commuting the operands */ 1093 scan_inst->src[0] = scan_inst->src[1]; 1094 scan_inst->src[1] = inst->src[0]; 1095 progress = true; 1096 } 1097 break; 1098 1099 case BRW_OPCODE_CMP: 1100 if (i == 1) { 1101 scan_inst->src[i] = inst->src[0]; 1102 progress = true; 1103 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1104 uint32_t new_cmod; 1105 1106 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 1107 if (new_cmod != ~0u) { 1108 /* Fit this constant in by swapping the operands and 1109 * flipping the test 1110 */ 1111 scan_inst->src[0] = scan_inst->src[1]; 1112 scan_inst->src[1] = inst->src[0]; 1113 scan_inst->conditional_mod = new_cmod; 1114 progress = true; 1115 } 1116 } 1117 break; 1118 1119 case BRW_OPCODE_SEL: 1120 if (i == 1) { 1121 scan_inst->src[i] = inst->src[0]; 1122 progress = true; 1123 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1124 scan_inst->src[0] = scan_inst->src[1]; 1125 scan_inst->src[1] = inst->src[0]; 1126 1127 /* If this was predicated, flipping operands means 1128 * we also need to flip the predicate. 1129 */ 1130 if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) { 1131 scan_inst->predicate_inverse = 1132 !scan_inst->predicate_inverse; 1133 } 1134 progress = true; 1135 } 1136 break; 1137 1138 case FS_OPCODE_RCP: 1139 /* The hardware doesn't do math on immediate values 1140 * (because why are you doing that, seriously?), but 1141 * the correct answer is to just constant fold it 1142 * anyway. 1143 */ 1144 assert(i == 0); 1145 if (inst->src[0].imm.f != 0.0f) { 1146 scan_inst->opcode = BRW_OPCODE_MOV; 1147 scan_inst->src[0] = inst->src[0]; 1148 scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f; 1149 progress = true; 1150 } 1151 break; 1152 } 1153 } 1154 1155 if (scan_inst->dst.file == GRF && 1156 scan_inst->dst.reg == inst->dst.reg && 1157 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1158 scan_inst->is_tex())) { 1159 break; 1160 } 1161 } 1162 } 1163 1164 if (progress) 1165 this->live_intervals_valid = false; 1166 1167 return progress; 1168} 1169 1170 1171/** 1172 * Attempts to move immediate constants into the immediate 1173 * constant slot of following instructions. 1174 * 1175 * Immediate constants are a bit tricky -- they have to be in the last 1176 * operand slot, you can't do abs/negate on them, 1177 */ 1178 1179bool 1180fs_visitor::opt_algebraic() 1181{ 1182 bool progress = false; 1183 1184 calculate_live_intervals(); 1185 1186 foreach_list(node, &this->instructions) { 1187 fs_inst *inst = (fs_inst *)node; 1188 1189 switch (inst->opcode) { 1190 case BRW_OPCODE_MUL: 1191 if (inst->src[1].file != IMM) 1192 continue; 1193 1194 /* a * 1.0 = a */ 1195 if (inst->src[1].type == BRW_REGISTER_TYPE_F && 1196 inst->src[1].imm.f == 1.0) { 1197 inst->opcode = BRW_OPCODE_MOV; 1198 inst->src[1] = reg_undef; 1199 progress = true; 1200 break; 1201 } 1202 1203 break; 1204 } 1205 } 1206 1207 return progress; 1208} 1209 1210/** 1211 * Must be called after calculate_live_intervales() to remove unused 1212 * writes to registers -- register allocation will fail otherwise 1213 * because something deffed but not used won't be considered to 1214 * interfere with other regs. 1215 */ 1216bool 1217fs_visitor::dead_code_eliminate() 1218{ 1219 bool progress = false; 1220 int pc = 0; 1221 1222 calculate_live_intervals(); 1223 1224 foreach_list_safe(node, &this->instructions) { 1225 fs_inst *inst = (fs_inst *)node; 1226 1227 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 1228 inst->remove(); 1229 progress = true; 1230 } 1231 1232 pc++; 1233 } 1234 1235 if (progress) 1236 live_intervals_valid = false; 1237 1238 return progress; 1239} 1240 1241bool 1242fs_visitor::register_coalesce() 1243{ 1244 bool progress = false; 1245 int if_depth = 0; 1246 int loop_depth = 0; 1247 1248 foreach_list_safe(node, &this->instructions) { 1249 fs_inst *inst = (fs_inst *)node; 1250 1251 /* Make sure that we dominate the instructions we're going to 1252 * scan for interfering with our coalescing, or we won't have 1253 * scanned enough to see if anything interferes with our 1254 * coalescing. We don't dominate the following instructions if 1255 * we're in a loop or an if block. 1256 */ 1257 switch (inst->opcode) { 1258 case BRW_OPCODE_DO: 1259 loop_depth++; 1260 break; 1261 case BRW_OPCODE_WHILE: 1262 loop_depth--; 1263 break; 1264 case BRW_OPCODE_IF: 1265 if_depth++; 1266 break; 1267 case BRW_OPCODE_ENDIF: 1268 if_depth--; 1269 break; 1270 } 1271 if (loop_depth || if_depth) 1272 continue; 1273 1274 if (inst->opcode != BRW_OPCODE_MOV || 1275 inst->predicated || 1276 inst->saturate || 1277 inst->dst.file != GRF || (inst->src[0].file != GRF && 1278 inst->src[0].file != UNIFORM)|| 1279 inst->dst.type != inst->src[0].type) 1280 continue; 1281 1282 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 1283 1284 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 1285 * them: check for no writes to either one until the exit of the 1286 * program. 1287 */ 1288 bool interfered = false; 1289 1290 for (fs_inst *scan_inst = (fs_inst *)inst->next; 1291 !scan_inst->is_tail_sentinel(); 1292 scan_inst = (fs_inst *)scan_inst->next) { 1293 if (scan_inst->dst.file == GRF) { 1294 if (scan_inst->dst.reg == inst->dst.reg && 1295 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1296 scan_inst->is_tex())) { 1297 interfered = true; 1298 break; 1299 } 1300 if (inst->src[0].file == GRF && 1301 scan_inst->dst.reg == inst->src[0].reg && 1302 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 1303 scan_inst->is_tex())) { 1304 interfered = true; 1305 break; 1306 } 1307 } 1308 1309 /* The gen6 MATH instruction can't handle source modifiers or 1310 * unusual register regions, so avoid coalescing those for 1311 * now. We should do something more specific. 1312 */ 1313 if (intel->gen >= 6 && 1314 scan_inst->is_math() && 1315 (has_source_modifiers || inst->src[0].file == UNIFORM)) { 1316 interfered = true; 1317 break; 1318 } 1319 } 1320 if (interfered) { 1321 continue; 1322 } 1323 1324 /* Rewrite the later usage to point at the source of the move to 1325 * be removed. 1326 */ 1327 for (fs_inst *scan_inst = inst; 1328 !scan_inst->is_tail_sentinel(); 1329 scan_inst = (fs_inst *)scan_inst->next) { 1330 for (int i = 0; i < 3; i++) { 1331 if (scan_inst->src[i].file == GRF && 1332 scan_inst->src[i].reg == inst->dst.reg && 1333 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 1334 fs_reg new_src = inst->src[0]; 1335 new_src.negate ^= scan_inst->src[i].negate; 1336 new_src.abs |= scan_inst->src[i].abs; 1337 scan_inst->src[i] = new_src; 1338 } 1339 } 1340 } 1341 1342 inst->remove(); 1343 progress = true; 1344 } 1345 1346 if (progress) 1347 live_intervals_valid = false; 1348 1349 return progress; 1350} 1351 1352 1353bool 1354fs_visitor::compute_to_mrf() 1355{ 1356 bool progress = false; 1357 int next_ip = 0; 1358 1359 calculate_live_intervals(); 1360 1361 foreach_list_safe(node, &this->instructions) { 1362 fs_inst *inst = (fs_inst *)node; 1363 1364 int ip = next_ip; 1365 next_ip++; 1366 1367 if (inst->opcode != BRW_OPCODE_MOV || 1368 inst->predicated || 1369 inst->dst.file != MRF || inst->src[0].file != GRF || 1370 inst->dst.type != inst->src[0].type || 1371 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 1372 continue; 1373 1374 /* Work out which hardware MRF registers are written by this 1375 * instruction. 1376 */ 1377 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1378 int mrf_high; 1379 if (inst->dst.hw_reg & BRW_MRF_COMPR4) { 1380 mrf_high = mrf_low + 4; 1381 } else if (c->dispatch_width == 16 && 1382 (!inst->force_uncompressed && !inst->force_sechalf)) { 1383 mrf_high = mrf_low + 1; 1384 } else { 1385 mrf_high = mrf_low; 1386 } 1387 1388 /* Can't compute-to-MRF this GRF if someone else was going to 1389 * read it later. 1390 */ 1391 if (this->virtual_grf_use[inst->src[0].reg] > ip) 1392 continue; 1393 1394 /* Found a move of a GRF to a MRF. Let's see if we can go 1395 * rewrite the thing that made this GRF to write into the MRF. 1396 */ 1397 fs_inst *scan_inst; 1398 for (scan_inst = (fs_inst *)inst->prev; 1399 scan_inst->prev != NULL; 1400 scan_inst = (fs_inst *)scan_inst->prev) { 1401 if (scan_inst->dst.file == GRF && 1402 scan_inst->dst.reg == inst->src[0].reg) { 1403 /* Found the last thing to write our reg we want to turn 1404 * into a compute-to-MRF. 1405 */ 1406 1407 if (scan_inst->is_tex()) { 1408 /* texturing writes several continuous regs, so we can't 1409 * compute-to-mrf that. 1410 */ 1411 break; 1412 } 1413 1414 /* If it's predicated, it (probably) didn't populate all 1415 * the channels. We might be able to rewrite everything 1416 * that writes that reg, but it would require smarter 1417 * tracking to delay the rewriting until complete success. 1418 */ 1419 if (scan_inst->predicated) 1420 break; 1421 1422 /* If it's half of register setup and not the same half as 1423 * our MOV we're trying to remove, bail for now. 1424 */ 1425 if (scan_inst->force_uncompressed != inst->force_uncompressed || 1426 scan_inst->force_sechalf != inst->force_sechalf) { 1427 break; 1428 } 1429 1430 /* SEND instructions can't have MRF as a destination. */ 1431 if (scan_inst->mlen) 1432 break; 1433 1434 if (intel->gen >= 6) { 1435 /* gen6 math instructions must have the destination be 1436 * GRF, so no compute-to-MRF for them. 1437 */ 1438 if (scan_inst->is_math()) { 1439 break; 1440 } 1441 } 1442 1443 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1444 /* Found the creator of our MRF's source value. */ 1445 scan_inst->dst.file = MRF; 1446 scan_inst->dst.hw_reg = inst->dst.hw_reg; 1447 scan_inst->saturate |= inst->saturate; 1448 inst->remove(); 1449 progress = true; 1450 } 1451 break; 1452 } 1453 1454 /* We don't handle flow control here. Most computation of 1455 * values that end up in MRFs are shortly before the MRF 1456 * write anyway. 1457 */ 1458 if (scan_inst->opcode == BRW_OPCODE_DO || 1459 scan_inst->opcode == BRW_OPCODE_WHILE || 1460 scan_inst->opcode == BRW_OPCODE_ELSE || 1461 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1462 break; 1463 } 1464 1465 /* You can't read from an MRF, so if someone else reads our 1466 * MRF's source GRF that we wanted to rewrite, that stops us. 1467 */ 1468 bool interfered = false; 1469 for (int i = 0; i < 3; i++) { 1470 if (scan_inst->src[i].file == GRF && 1471 scan_inst->src[i].reg == inst->src[0].reg && 1472 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 1473 interfered = true; 1474 } 1475 } 1476 if (interfered) 1477 break; 1478 1479 if (scan_inst->dst.file == MRF) { 1480 /* If somebody else writes our MRF here, we can't 1481 * compute-to-MRF before that. 1482 */ 1483 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1484 int scan_mrf_high; 1485 1486 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) { 1487 scan_mrf_high = scan_mrf_low + 4; 1488 } else if (c->dispatch_width == 16 && 1489 (!scan_inst->force_uncompressed && 1490 !scan_inst->force_sechalf)) { 1491 scan_mrf_high = scan_mrf_low + 1; 1492 } else { 1493 scan_mrf_high = scan_mrf_low; 1494 } 1495 1496 if (mrf_low == scan_mrf_low || 1497 mrf_low == scan_mrf_high || 1498 mrf_high == scan_mrf_low || 1499 mrf_high == scan_mrf_high) { 1500 break; 1501 } 1502 } 1503 1504 if (scan_inst->mlen > 0) { 1505 /* Found a SEND instruction, which means that there are 1506 * live values in MRFs from base_mrf to base_mrf + 1507 * scan_inst->mlen - 1. Don't go pushing our MRF write up 1508 * above it. 1509 */ 1510 if (mrf_low >= scan_inst->base_mrf && 1511 mrf_low < scan_inst->base_mrf + scan_inst->mlen) { 1512 break; 1513 } 1514 if (mrf_high >= scan_inst->base_mrf && 1515 mrf_high < scan_inst->base_mrf + scan_inst->mlen) { 1516 break; 1517 } 1518 } 1519 } 1520 } 1521 1522 return progress; 1523} 1524 1525/** 1526 * Walks through basic blocks, locking for repeated MRF writes and 1527 * removing the later ones. 1528 */ 1529bool 1530fs_visitor::remove_duplicate_mrf_writes() 1531{ 1532 fs_inst *last_mrf_move[16]; 1533 bool progress = false; 1534 1535 /* Need to update the MRF tracking for compressed instructions. */ 1536 if (c->dispatch_width == 16) 1537 return false; 1538 1539 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1540 1541 foreach_list_safe(node, &this->instructions) { 1542 fs_inst *inst = (fs_inst *)node; 1543 1544 switch (inst->opcode) { 1545 case BRW_OPCODE_DO: 1546 case BRW_OPCODE_WHILE: 1547 case BRW_OPCODE_IF: 1548 case BRW_OPCODE_ELSE: 1549 case BRW_OPCODE_ENDIF: 1550 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1551 continue; 1552 default: 1553 break; 1554 } 1555 1556 if (inst->opcode == BRW_OPCODE_MOV && 1557 inst->dst.file == MRF) { 1558 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 1559 if (prev_inst && inst->equals(prev_inst)) { 1560 inst->remove(); 1561 progress = true; 1562 continue; 1563 } 1564 } 1565 1566 /* Clear out the last-write records for MRFs that were overwritten. */ 1567 if (inst->dst.file == MRF) { 1568 last_mrf_move[inst->dst.hw_reg] = NULL; 1569 } 1570 1571 if (inst->mlen > 0) { 1572 /* Found a SEND instruction, which will include two or fewer 1573 * implied MRF writes. We could do better here. 1574 */ 1575 for (int i = 0; i < implied_mrf_writes(inst); i++) { 1576 last_mrf_move[inst->base_mrf + i] = NULL; 1577 } 1578 } 1579 1580 /* Clear out any MRF move records whose sources got overwritten. */ 1581 if (inst->dst.file == GRF) { 1582 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 1583 if (last_mrf_move[i] && 1584 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 1585 last_mrf_move[i] = NULL; 1586 } 1587 } 1588 } 1589 1590 if (inst->opcode == BRW_OPCODE_MOV && 1591 inst->dst.file == MRF && 1592 inst->src[0].file == GRF && 1593 !inst->predicated) { 1594 last_mrf_move[inst->dst.hw_reg] = inst; 1595 } 1596 } 1597 1598 return progress; 1599} 1600 1601bool 1602fs_visitor::virtual_grf_interferes(int a, int b) 1603{ 1604 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 1605 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 1606 1607 /* We can't handle dead register writes here, without iterating 1608 * over the whole instruction stream to find every single dead 1609 * write to that register to compare to the live interval of the 1610 * other register. Just assert that dead_code_eliminate() has been 1611 * called. 1612 */ 1613 assert((this->virtual_grf_use[a] != -1 || 1614 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 1615 (this->virtual_grf_use[b] != -1 || 1616 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 1617 1618 /* If the register is used to store 16 values of less than float 1619 * size (only the case for pixel_[xy]), then we can't allocate 1620 * another dword-sized thing to that register that would be used in 1621 * the same instruction. This is because when the GPU decodes (for 1622 * example): 1623 * 1624 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 1625 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 1626 * 1627 * it's actually processed as: 1628 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 1629 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 1630 * 1631 * so our second half values in g6 got overwritten in the first 1632 * half. 1633 */ 1634 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 1635 this->pixel_x.reg == b || 1636 this->pixel_y.reg == a || 1637 this->pixel_y.reg == b)) { 1638 return start <= end; 1639 } 1640 1641 return start < end; 1642} 1643 1644bool 1645fs_visitor::run() 1646{ 1647 uint32_t prog_offset_16 = 0; 1648 uint32_t orig_nr_params = c->prog_data.nr_params; 1649 1650 brw_wm_payload_setup(brw, c); 1651 1652 if (c->dispatch_width == 16) { 1653 /* align to 64 byte boundary. */ 1654 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { 1655 brw_NOP(p); 1656 } 1657 1658 /* Save off the start of this 16-wide program in case we succeed. */ 1659 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); 1660 1661 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 1662 } 1663 1664 if (0) { 1665 emit_dummy_fs(); 1666 } else { 1667 calculate_urb_setup(); 1668 if (intel->gen < 6) 1669 emit_interpolation_setup_gen4(); 1670 else 1671 emit_interpolation_setup_gen6(); 1672 1673 /* Generate FS IR for main(). (the visitor only descends into 1674 * functions called "main"). 1675 */ 1676 foreach_list(node, &*shader->ir) { 1677 ir_instruction *ir = (ir_instruction *)node; 1678 base_ir = ir; 1679 this->result = reg_undef; 1680 ir->accept(this); 1681 } 1682 if (failed) 1683 return false; 1684 1685 emit_fb_writes(); 1686 1687 split_virtual_grfs(); 1688 1689 setup_paramvalues_refs(); 1690 setup_pull_constants(); 1691 1692 bool progress; 1693 do { 1694 progress = false; 1695 1696 progress = remove_duplicate_mrf_writes() || progress; 1697 1698 progress = propagate_constants() || progress; 1699 progress = opt_algebraic() || progress; 1700 progress = register_coalesce() || progress; 1701 progress = compute_to_mrf() || progress; 1702 progress = dead_code_eliminate() || progress; 1703 } while (progress); 1704 1705 remove_dead_constants(); 1706 1707 schedule_instructions(); 1708 1709 assign_curb_setup(); 1710 assign_urb_setup(); 1711 1712 if (0) { 1713 /* Debug of register spilling: Go spill everything. */ 1714 int virtual_grf_count = virtual_grf_next; 1715 for (int i = 0; i < virtual_grf_count; i++) { 1716 spill_reg(i); 1717 } 1718 } 1719 1720 if (0) 1721 assign_regs_trivial(); 1722 else { 1723 while (!assign_regs()) { 1724 if (failed) 1725 break; 1726 } 1727 } 1728 } 1729 assert(force_uncompressed_stack == 0); 1730 assert(force_sechalf_stack == 0); 1731 1732 if (failed) 1733 return false; 1734 1735 generate_code(); 1736 1737 if (c->dispatch_width == 8) { 1738 c->prog_data.reg_blocks = brw_register_blocks(grf_used); 1739 } else { 1740 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); 1741 c->prog_data.prog_offset_16 = prog_offset_16; 1742 1743 /* Make sure we didn't try to sneak in an extra uniform */ 1744 assert(orig_nr_params == c->prog_data.nr_params); 1745 } 1746 1747 return !failed; 1748} 1749 1750bool 1751brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, 1752 struct gl_shader_program *prog) 1753{ 1754 struct intel_context *intel = &brw->intel; 1755 1756 if (!prog) 1757 return false; 1758 1759 struct brw_shader *shader = 1760 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 1761 if (!shader) 1762 return false; 1763 1764 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1765 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 1766 _mesa_print_ir(shader->ir, NULL); 1767 printf("\n\n"); 1768 } 1769 1770 /* Now the main event: Visit the shader IR and generate our FS IR for it. 1771 */ 1772 c->dispatch_width = 8; 1773 1774 fs_visitor v(c, prog, shader); 1775 if (!v.run()) { 1776 prog->LinkStatus = GL_FALSE; 1777 prog->InfoLog = ralloc_strdup(prog, v.fail_msg); 1778 1779 return false; 1780 } 1781 1782 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { 1783 c->dispatch_width = 16; 1784 fs_visitor v2(c, prog, shader); 1785 v2.import_uniforms(&v); 1786 v2.run(); 1787 } 1788 1789 c->prog_data.dispatch_width = 8; 1790 1791 return true; 1792} 1793 1794bool 1795brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) 1796{ 1797 struct brw_context *brw = brw_context(ctx); 1798 struct brw_wm_prog_key key; 1799 struct gl_fragment_program *fp = prog->FragmentProgram; 1800 struct brw_fragment_program *bfp = brw_fragment_program(fp); 1801 1802 if (!fp) 1803 return true; 1804 1805 memset(&key, 0, sizeof(key)); 1806 1807 if (fp->UsesKill) 1808 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; 1809 1810 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 1811 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; 1812 1813 /* Just assume depth testing. */ 1814 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; 1815 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; 1816 1817 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); 1818 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { 1819 int vp_index = -1; 1820 1821 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) 1822 continue; 1823 1824 key.proj_attrib_mask |= 1 << i; 1825 1826 if (i <= FRAG_ATTRIB_TEX7) 1827 vp_index = i; 1828 else if (i >= FRAG_ATTRIB_VAR0) 1829 vp_index = i - FRAG_ATTRIB_VAR0 + VERT_RESULT_VAR0; 1830 1831 if (vp_index >= 0) 1832 key.vp_outputs_written |= BITFIELD64_BIT(vp_index); 1833 } 1834 1835 key.clamp_fragment_color = true; 1836 1837 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) { 1838 if (fp->Base.ShadowSamplers & (1 << i)) 1839 key.compare_funcs[i] = GL_LESS; 1840 1841 /* FINISHME: depth compares might use (0,0,0,W) for example */ 1842 key.tex_swizzles[i] = SWIZZLE_XYZW; 1843 } 1844 1845 if (fp->Base.InputsRead & FRAG_BIT_WPOS) { 1846 key.drawable_height = ctx->DrawBuffer->Height; 1847 key.render_to_fbo = ctx->DrawBuffer->Name != 0; 1848 } 1849 1850 key.nr_color_regions = 1; 1851 1852 key.program_string_id = bfp->id; 1853 1854 uint32_t old_prog_offset = brw->wm.prog_offset; 1855 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; 1856 1857 bool success = do_wm_prog(brw, prog, bfp, &key); 1858 1859 brw->wm.prog_offset = old_prog_offset; 1860 brw->wm.prog_data = old_prog_data; 1861 1862 return success; 1863} 1864