brw_fs.cpp revision 4683529048ee133481b2d8f1cae1685aa1736f9a
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs.cpp 25 * 26 * This file drives the GLSL IR -> LIR translation, contains the 27 * optimizations on the LIR, and drives the generation of native code 28 * from the LIR. 29 */ 30 31extern "C" { 32 33#include <sys/types.h> 34 35#include "main/macros.h" 36#include "main/shaderobj.h" 37#include "main/uniforms.h" 38#include "program/prog_parameter.h" 39#include "program/prog_print.h" 40#include "program/register_allocate.h" 41#include "program/sampler.h" 42#include "program/hash_table.h" 43#include "brw_context.h" 44#include "brw_eu.h" 45#include "brw_wm.h" 46} 47#include "brw_shader.h" 48#include "brw_fs.h" 49#include "../glsl/glsl_types.h" 50#include "../glsl/ir_print_visitor.h" 51 52#define MAX_INSTRUCTION (1 << 30) 53 54int 55fs_visitor::type_size(const struct glsl_type *type) 56{ 57 unsigned int size, i; 58 59 switch (type->base_type) { 60 case GLSL_TYPE_UINT: 61 case GLSL_TYPE_INT: 62 case GLSL_TYPE_FLOAT: 63 case GLSL_TYPE_BOOL: 64 return type->components(); 65 case GLSL_TYPE_ARRAY: 66 return type_size(type->fields.array) * type->length; 67 case GLSL_TYPE_STRUCT: 68 size = 0; 69 for (i = 0; i < type->length; i++) { 70 size += type_size(type->fields.structure[i].type); 71 } 72 return size; 73 case GLSL_TYPE_SAMPLER: 74 /* Samplers take up no register space, since they're baked in at 75 * link time. 76 */ 77 return 0; 78 default: 79 assert(!"not reached"); 80 return 0; 81 } 82} 83 84void 85fs_visitor::fail(const char *format, ...) 86{ 87 va_list va; 88 char *msg; 89 90 if (failed) 91 return; 92 93 failed = true; 94 95 va_start(va, format); 96 msg = ralloc_vasprintf(mem_ctx, format, va); 97 va_end(va); 98 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); 99 100 this->fail_msg = msg; 101 102 if (INTEL_DEBUG & DEBUG_WM) { 103 fprintf(stderr, "%s", msg); 104 } 105} 106 107void 108fs_visitor::push_force_uncompressed() 109{ 110 force_uncompressed_stack++; 111} 112 113void 114fs_visitor::pop_force_uncompressed() 115{ 116 force_uncompressed_stack--; 117 assert(force_uncompressed_stack >= 0); 118} 119 120void 121fs_visitor::push_force_sechalf() 122{ 123 force_sechalf_stack++; 124} 125 126void 127fs_visitor::pop_force_sechalf() 128{ 129 force_sechalf_stack--; 130 assert(force_sechalf_stack >= 0); 131} 132 133/** 134 * Returns how many MRFs an FS opcode will write over. 135 * 136 * Note that this is not the 0 or 1 implied writes in an actual gen 137 * instruction -- the FS opcodes often generate MOVs in addition. 138 */ 139int 140fs_visitor::implied_mrf_writes(fs_inst *inst) 141{ 142 if (inst->mlen == 0) 143 return 0; 144 145 switch (inst->opcode) { 146 case FS_OPCODE_RCP: 147 case FS_OPCODE_RSQ: 148 case FS_OPCODE_SQRT: 149 case FS_OPCODE_EXP2: 150 case FS_OPCODE_LOG2: 151 case FS_OPCODE_SIN: 152 case FS_OPCODE_COS: 153 return 1 * c->dispatch_width / 8; 154 case FS_OPCODE_POW: 155 return 2 * c->dispatch_width / 8; 156 case FS_OPCODE_TEX: 157 case FS_OPCODE_TXB: 158 case FS_OPCODE_TXD: 159 case FS_OPCODE_TXL: 160 return 1; 161 case FS_OPCODE_FB_WRITE: 162 return 2; 163 case FS_OPCODE_PULL_CONSTANT_LOAD: 164 case FS_OPCODE_UNSPILL: 165 return 1; 166 case FS_OPCODE_SPILL: 167 return 2; 168 default: 169 assert(!"not reached"); 170 return inst->mlen; 171 } 172} 173 174int 175fs_visitor::virtual_grf_alloc(int size) 176{ 177 if (virtual_grf_array_size <= virtual_grf_next) { 178 if (virtual_grf_array_size == 0) 179 virtual_grf_array_size = 16; 180 else 181 virtual_grf_array_size *= 2; 182 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 183 virtual_grf_array_size); 184 185 /* This slot is always unused. */ 186 virtual_grf_sizes[0] = 0; 187 } 188 virtual_grf_sizes[virtual_grf_next] = size; 189 return virtual_grf_next++; 190} 191 192/** Fixed HW reg constructor. */ 193fs_reg::fs_reg(enum register_file file, int hw_reg) 194{ 195 init(); 196 this->file = file; 197 this->hw_reg = hw_reg; 198 this->type = BRW_REGISTER_TYPE_F; 199} 200 201/** Fixed HW reg constructor. */ 202fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 203{ 204 init(); 205 this->file = file; 206 this->hw_reg = hw_reg; 207 this->type = type; 208} 209 210/** Automatic reg constructor. */ 211fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 212{ 213 init(); 214 215 this->file = GRF; 216 this->reg = v->virtual_grf_alloc(v->type_size(type)); 217 this->reg_offset = 0; 218 this->type = brw_type_for_base_type(type); 219} 220 221fs_reg * 222fs_visitor::variable_storage(ir_variable *var) 223{ 224 return (fs_reg *)hash_table_find(this->variable_ht, var); 225} 226 227void 228import_uniforms_callback(const void *key, 229 void *data, 230 void *closure) 231{ 232 struct hash_table *dst_ht = (struct hash_table *)closure; 233 const fs_reg *reg = (const fs_reg *)data; 234 235 if (reg->file != UNIFORM) 236 return; 237 238 hash_table_insert(dst_ht, data, key); 239} 240 241/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 242 * This brings in those uniform definitions 243 */ 244void 245fs_visitor::import_uniforms(struct hash_table *src_variable_ht) 246{ 247 hash_table_call_foreach(src_variable_ht, 248 import_uniforms_callback, 249 variable_ht); 250} 251 252/* Our support for uniforms is piggy-backed on the struct 253 * gl_fragment_program, because that's where the values actually 254 * get stored, rather than in some global gl_shader_program uniform 255 * store. 256 */ 257int 258fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 259{ 260 unsigned int offset = 0; 261 262 if (type->is_matrix()) { 263 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 264 type->vector_elements, 265 1); 266 267 for (unsigned int i = 0; i < type->matrix_columns; i++) { 268 offset += setup_uniform_values(loc + offset, column); 269 } 270 271 return offset; 272 } 273 274 switch (type->base_type) { 275 case GLSL_TYPE_FLOAT: 276 case GLSL_TYPE_UINT: 277 case GLSL_TYPE_INT: 278 case GLSL_TYPE_BOOL: 279 for (unsigned int i = 0; i < type->vector_elements; i++) { 280 unsigned int param = c->prog_data.nr_params++; 281 282 assert(param < ARRAY_SIZE(c->prog_data.param)); 283 284 switch (type->base_type) { 285 case GLSL_TYPE_FLOAT: 286 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 287 break; 288 case GLSL_TYPE_UINT: 289 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 290 break; 291 case GLSL_TYPE_INT: 292 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 293 break; 294 case GLSL_TYPE_BOOL: 295 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 296 break; 297 default: 298 assert(!"not reached"); 299 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 300 break; 301 } 302 this->param_index[param] = loc; 303 this->param_offset[param] = i; 304 } 305 return 1; 306 307 case GLSL_TYPE_STRUCT: 308 for (unsigned int i = 0; i < type->length; i++) { 309 offset += setup_uniform_values(loc + offset, 310 type->fields.structure[i].type); 311 } 312 return offset; 313 314 case GLSL_TYPE_ARRAY: 315 for (unsigned int i = 0; i < type->length; i++) { 316 offset += setup_uniform_values(loc + offset, type->fields.array); 317 } 318 return offset; 319 320 case GLSL_TYPE_SAMPLER: 321 /* The sampler takes up a slot, but we don't use any values from it. */ 322 return 1; 323 324 default: 325 assert(!"not reached"); 326 return 0; 327 } 328} 329 330 331/* Our support for builtin uniforms is even scarier than non-builtin. 332 * It sits on top of the PROG_STATE_VAR parameters that are 333 * automatically updated from GL context state. 334 */ 335void 336fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 337{ 338 const ir_state_slot *const slots = ir->state_slots; 339 assert(ir->state_slots != NULL); 340 341 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 342 /* This state reference has already been setup by ir_to_mesa, but we'll 343 * get the same index back here. 344 */ 345 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 346 (gl_state_index *)slots[i].tokens); 347 348 /* Add each of the unique swizzles of the element as a parameter. 349 * This'll end up matching the expected layout of the 350 * array/matrix/structure we're trying to fill in. 351 */ 352 int last_swiz = -1; 353 for (unsigned int j = 0; j < 4; j++) { 354 int swiz = GET_SWZ(slots[i].swizzle, j); 355 if (swiz == last_swiz) 356 break; 357 last_swiz = swiz; 358 359 c->prog_data.param_convert[c->prog_data.nr_params] = 360 PARAM_NO_CONVERT; 361 this->param_index[c->prog_data.nr_params] = index; 362 this->param_offset[c->prog_data.nr_params] = swiz; 363 c->prog_data.nr_params++; 364 } 365 } 366} 367 368fs_reg * 369fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 370{ 371 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 372 fs_reg wpos = *reg; 373 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 374 375 /* gl_FragCoord.x */ 376 if (ir->pixel_center_integer) { 377 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 378 } else { 379 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 380 } 381 wpos.reg_offset++; 382 383 /* gl_FragCoord.y */ 384 if (!flip && ir->pixel_center_integer) { 385 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 386 } else { 387 fs_reg pixel_y = this->pixel_y; 388 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 389 390 if (flip) { 391 pixel_y.negate = true; 392 offset += c->key.drawable_height - 1.0; 393 } 394 395 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 396 } 397 wpos.reg_offset++; 398 399 /* gl_FragCoord.z */ 400 if (intel->gen >= 6) { 401 emit(BRW_OPCODE_MOV, wpos, 402 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 403 } else { 404 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 405 interp_reg(FRAG_ATTRIB_WPOS, 2)); 406 } 407 wpos.reg_offset++; 408 409 /* gl_FragCoord.w: Already set up in emit_interpolation */ 410 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 411 412 return reg; 413} 414 415fs_reg * 416fs_visitor::emit_general_interpolation(ir_variable *ir) 417{ 418 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 419 /* Interpolation is always in floating point regs. */ 420 reg->type = BRW_REGISTER_TYPE_F; 421 fs_reg attr = *reg; 422 423 unsigned int array_elements; 424 const glsl_type *type; 425 426 if (ir->type->is_array()) { 427 array_elements = ir->type->length; 428 if (array_elements == 0) { 429 fail("dereferenced array '%s' has length 0\n", ir->name); 430 } 431 type = ir->type->fields.array; 432 } else { 433 array_elements = 1; 434 type = ir->type; 435 } 436 437 int location = ir->location; 438 for (unsigned int i = 0; i < array_elements; i++) { 439 for (unsigned int j = 0; j < type->matrix_columns; j++) { 440 if (urb_setup[location] == -1) { 441 /* If there's no incoming setup data for this slot, don't 442 * emit interpolation for it. 443 */ 444 attr.reg_offset += type->vector_elements; 445 location++; 446 continue; 447 } 448 449 bool is_gl_Color = 450 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 451 452 if (c->key.flat_shade && is_gl_Color) { 453 /* Constant interpolation (flat shading) case. The SF has 454 * handed us defined values in only the constant offset 455 * field of the setup reg. 456 */ 457 for (unsigned int k = 0; k < type->vector_elements; k++) { 458 struct brw_reg interp = interp_reg(location, k); 459 interp = suboffset(interp, 3); 460 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 461 attr.reg_offset++; 462 } 463 } else { 464 /* Perspective interpolation case. */ 465 for (unsigned int k = 0; k < type->vector_elements; k++) { 466 /* FINISHME: At some point we probably want to push 467 * this farther by giving similar treatment to the 468 * other potentially constant components of the 469 * attribute, as well as making brw_vs_constval.c 470 * handle varyings other than gl_TexCoord. 471 */ 472 if (location >= FRAG_ATTRIB_TEX0 && 473 location <= FRAG_ATTRIB_TEX7 && 474 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) { 475 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f)); 476 } else { 477 struct brw_reg interp = interp_reg(location, k); 478 emit(FS_OPCODE_LINTERP, attr, 479 this->delta_x, this->delta_y, fs_reg(interp)); 480 } 481 attr.reg_offset++; 482 } 483 484 if (intel->gen < 6) { 485 attr.reg_offset -= type->vector_elements; 486 for (unsigned int k = 0; k < type->vector_elements; k++) { 487 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 488 attr.reg_offset++; 489 } 490 } 491 } 492 location++; 493 } 494 } 495 496 return reg; 497} 498 499fs_reg * 500fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 501{ 502 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 503 504 /* The frontfacing comes in as a bit in the thread payload. */ 505 if (intel->gen >= 6) { 506 emit(BRW_OPCODE_ASR, *reg, 507 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 508 fs_reg(15)); 509 emit(BRW_OPCODE_NOT, *reg, *reg); 510 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 511 } else { 512 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 513 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 514 * us front face 515 */ 516 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 517 fs_reg(r1_6ud), 518 fs_reg(1u << 31)); 519 inst->conditional_mod = BRW_CONDITIONAL_L; 520 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 521 } 522 523 return reg; 524} 525 526fs_inst * 527fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 528{ 529 switch (opcode) { 530 case FS_OPCODE_RCP: 531 case FS_OPCODE_RSQ: 532 case FS_OPCODE_SQRT: 533 case FS_OPCODE_EXP2: 534 case FS_OPCODE_LOG2: 535 case FS_OPCODE_SIN: 536 case FS_OPCODE_COS: 537 break; 538 default: 539 assert(!"not reached: bad math opcode"); 540 return NULL; 541 } 542 543 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 544 * might be able to do better by doing execsize = 1 math and then 545 * expanding that result out, but we would need to be careful with 546 * masking. 547 * 548 * The hardware ignores source modifiers (negate and abs) on math 549 * instructions, so we also move to a temp to set those up. 550 */ 551 if (intel->gen >= 6 && (src.file == UNIFORM || 552 src.abs || 553 src.negate)) { 554 fs_reg expanded = fs_reg(this, glsl_type::float_type); 555 emit(BRW_OPCODE_MOV, expanded, src); 556 src = expanded; 557 } 558 559 fs_inst *inst = emit(opcode, dst, src); 560 561 if (intel->gen < 6) { 562 inst->base_mrf = 2; 563 inst->mlen = c->dispatch_width / 8; 564 } 565 566 return inst; 567} 568 569fs_inst * 570fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 571{ 572 int base_mrf = 2; 573 fs_inst *inst; 574 575 assert(opcode == FS_OPCODE_POW); 576 577 if (intel->gen >= 6) { 578 /* Can't do hstride == 0 args to gen6 math, so expand it out. 579 * 580 * The hardware ignores source modifiers (negate and abs) on math 581 * instructions, so we also move to a temp to set those up. 582 */ 583 if (src0.file == UNIFORM || src0.abs || src0.negate) { 584 fs_reg expanded = fs_reg(this, glsl_type::float_type); 585 emit(BRW_OPCODE_MOV, expanded, src0); 586 src0 = expanded; 587 } 588 589 if (src1.file == UNIFORM || src1.abs || src1.negate) { 590 fs_reg expanded = fs_reg(this, glsl_type::float_type); 591 emit(BRW_OPCODE_MOV, expanded, src1); 592 src1 = expanded; 593 } 594 595 inst = emit(opcode, dst, src0, src1); 596 } else { 597 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 598 inst = emit(opcode, dst, src0, reg_null_f); 599 600 inst->base_mrf = base_mrf; 601 inst->mlen = 2 * c->dispatch_width / 8; 602 } 603 return inst; 604} 605 606/** 607 * To be called after the last _mesa_add_state_reference() call, to 608 * set up prog_data.param[] for assign_curb_setup() and 609 * setup_pull_constants(). 610 */ 611void 612fs_visitor::setup_paramvalues_refs() 613{ 614 if (c->dispatch_width != 8) 615 return; 616 617 /* Set up the pointers to ParamValues now that that array is finalized. */ 618 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 619 c->prog_data.param[i] = 620 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] + 621 this->param_offset[i]; 622 } 623} 624 625void 626fs_visitor::assign_curb_setup() 627{ 628 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 629 if (c->dispatch_width == 8) { 630 c->prog_data.first_curbe_grf = c->nr_payload_regs; 631 } else { 632 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 633 } 634 635 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 636 foreach_list(node, &this->instructions) { 637 fs_inst *inst = (fs_inst *)node; 638 639 for (unsigned int i = 0; i < 3; i++) { 640 if (inst->src[i].file == UNIFORM) { 641 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 642 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 643 constant_nr / 8, 644 constant_nr % 8); 645 646 inst->src[i].file = FIXED_HW_REG; 647 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 648 } 649 } 650 } 651} 652 653void 654fs_visitor::calculate_urb_setup() 655{ 656 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 657 urb_setup[i] = -1; 658 } 659 660 int urb_next = 0; 661 /* Figure out where each of the incoming setup attributes lands. */ 662 if (intel->gen >= 6) { 663 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 664 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { 665 urb_setup[i] = urb_next++; 666 } 667 } 668 } else { 669 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 670 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 671 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 672 int fp_index; 673 674 if (i >= VERT_RESULT_VAR0) 675 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 676 else if (i <= VERT_RESULT_TEX7) 677 fp_index = i; 678 else 679 fp_index = -1; 680 681 if (fp_index >= 0) 682 urb_setup[fp_index] = urb_next++; 683 } 684 } 685 } 686 687 /* Each attribute is 4 setup channels, each of which is half a reg. */ 688 c->prog_data.urb_read_length = urb_next * 2; 689} 690 691void 692fs_visitor::assign_urb_setup() 693{ 694 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 695 696 /* Offset all the urb_setup[] index by the actual position of the 697 * setup regs, now that the location of the constants has been chosen. 698 */ 699 foreach_list(node, &this->instructions) { 700 fs_inst *inst = (fs_inst *)node; 701 702 if (inst->opcode == FS_OPCODE_LINTERP) { 703 assert(inst->src[2].file == FIXED_HW_REG); 704 inst->src[2].fixed_hw_reg.nr += urb_start; 705 } 706 707 if (inst->opcode == FS_OPCODE_CINTERP) { 708 assert(inst->src[0].file == FIXED_HW_REG); 709 inst->src[0].fixed_hw_reg.nr += urb_start; 710 } 711 } 712 713 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 714} 715 716/** 717 * Split large virtual GRFs into separate components if we can. 718 * 719 * This is mostly duplicated with what brw_fs_vector_splitting does, 720 * but that's really conservative because it's afraid of doing 721 * splitting that doesn't result in real progress after the rest of 722 * the optimization phases, which would cause infinite looping in 723 * optimization. We can do it once here, safely. This also has the 724 * opportunity to split interpolated values, or maybe even uniforms, 725 * which we don't have at the IR level. 726 * 727 * We want to split, because virtual GRFs are what we register 728 * allocate and spill (due to contiguousness requirements for some 729 * instructions), and they're what we naturally generate in the 730 * codegen process, but most virtual GRFs don't actually need to be 731 * contiguous sets of GRFs. If we split, we'll end up with reduced 732 * live intervals and better dead code elimination and coalescing. 733 */ 734void 735fs_visitor::split_virtual_grfs() 736{ 737 int num_vars = this->virtual_grf_next; 738 bool split_grf[num_vars]; 739 int new_virtual_grf[num_vars]; 740 741 /* Try to split anything > 0 sized. */ 742 for (int i = 0; i < num_vars; i++) { 743 if (this->virtual_grf_sizes[i] != 1) 744 split_grf[i] = true; 745 else 746 split_grf[i] = false; 747 } 748 749 if (brw->has_pln) { 750 /* PLN opcodes rely on the delta_xy being contiguous. */ 751 split_grf[this->delta_x.reg] = false; 752 } 753 754 foreach_list(node, &this->instructions) { 755 fs_inst *inst = (fs_inst *)node; 756 757 /* Texturing produces 4 contiguous registers, so no splitting. */ 758 if (inst->is_tex()) { 759 split_grf[inst->dst.reg] = false; 760 } 761 } 762 763 /* Allocate new space for split regs. Note that the virtual 764 * numbers will be contiguous. 765 */ 766 for (int i = 0; i < num_vars; i++) { 767 if (split_grf[i]) { 768 new_virtual_grf[i] = virtual_grf_alloc(1); 769 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 770 int reg = virtual_grf_alloc(1); 771 assert(reg == new_virtual_grf[i] + j - 1); 772 (void) reg; 773 } 774 this->virtual_grf_sizes[i] = 1; 775 } 776 } 777 778 foreach_list(node, &this->instructions) { 779 fs_inst *inst = (fs_inst *)node; 780 781 if (inst->dst.file == GRF && 782 split_grf[inst->dst.reg] && 783 inst->dst.reg_offset != 0) { 784 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 785 inst->dst.reg_offset - 1); 786 inst->dst.reg_offset = 0; 787 } 788 for (int i = 0; i < 3; i++) { 789 if (inst->src[i].file == GRF && 790 split_grf[inst->src[i].reg] && 791 inst->src[i].reg_offset != 0) { 792 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 793 inst->src[i].reg_offset - 1); 794 inst->src[i].reg_offset = 0; 795 } 796 } 797 } 798 this->live_intervals_valid = false; 799} 800 801/** 802 * Choose accesses from the UNIFORM file to demote to using the pull 803 * constant buffer. 804 * 805 * We allow a fragment shader to have more than the specified minimum 806 * maximum number of fragment shader uniform components (64). If 807 * there are too many of these, they'd fill up all of register space. 808 * So, this will push some of them out to the pull constant buffer and 809 * update the program to load them. 810 */ 811void 812fs_visitor::setup_pull_constants() 813{ 814 /* Only allow 16 registers (128 uniform components) as push constants. */ 815 unsigned int max_uniform_components = 16 * 8; 816 if (c->prog_data.nr_params <= max_uniform_components) 817 return; 818 819 if (c->dispatch_width == 16) { 820 fail("Pull constants not supported in 16-wide\n"); 821 return; 822 } 823 824 /* Just demote the end of the list. We could probably do better 825 * here, demoting things that are rarely used in the program first. 826 */ 827 int pull_uniform_base = max_uniform_components; 828 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 829 830 foreach_list(node, &this->instructions) { 831 fs_inst *inst = (fs_inst *)node; 832 833 for (int i = 0; i < 3; i++) { 834 if (inst->src[i].file != UNIFORM) 835 continue; 836 837 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 838 if (uniform_nr < pull_uniform_base) 839 continue; 840 841 fs_reg dst = fs_reg(this, glsl_type::float_type); 842 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 843 dst); 844 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 845 pull->ir = inst->ir; 846 pull->annotation = inst->annotation; 847 pull->base_mrf = 14; 848 pull->mlen = 1; 849 850 inst->insert_before(pull); 851 852 inst->src[i].file = GRF; 853 inst->src[i].reg = dst.reg; 854 inst->src[i].reg_offset = 0; 855 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 856 } 857 } 858 859 for (int i = 0; i < pull_uniform_count; i++) { 860 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 861 c->prog_data.pull_param_convert[i] = 862 c->prog_data.param_convert[pull_uniform_base + i]; 863 } 864 c->prog_data.nr_params -= pull_uniform_count; 865 c->prog_data.nr_pull_params = pull_uniform_count; 866} 867 868void 869fs_visitor::calculate_live_intervals() 870{ 871 int num_vars = this->virtual_grf_next; 872 int *def = ralloc_array(mem_ctx, int, num_vars); 873 int *use = ralloc_array(mem_ctx, int, num_vars); 874 int loop_depth = 0; 875 int loop_start = 0; 876 877 if (this->live_intervals_valid) 878 return; 879 880 for (int i = 0; i < num_vars; i++) { 881 def[i] = MAX_INSTRUCTION; 882 use[i] = -1; 883 } 884 885 int ip = 0; 886 foreach_list(node, &this->instructions) { 887 fs_inst *inst = (fs_inst *)node; 888 889 if (inst->opcode == BRW_OPCODE_DO) { 890 if (loop_depth++ == 0) 891 loop_start = ip; 892 } else if (inst->opcode == BRW_OPCODE_WHILE) { 893 loop_depth--; 894 895 if (loop_depth == 0) { 896 /* Patches up the use of vars marked for being live across 897 * the whole loop. 898 */ 899 for (int i = 0; i < num_vars; i++) { 900 if (use[i] == loop_start) { 901 use[i] = ip; 902 } 903 } 904 } 905 } else { 906 for (unsigned int i = 0; i < 3; i++) { 907 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 908 int reg = inst->src[i].reg; 909 910 if (!loop_depth) { 911 use[reg] = ip; 912 } else { 913 def[reg] = MIN2(loop_start, def[reg]); 914 use[reg] = loop_start; 915 916 /* Nobody else is going to go smash our start to 917 * later in the loop now, because def[reg] now 918 * points before the bb header. 919 */ 920 } 921 } 922 } 923 if (inst->dst.file == GRF && inst->dst.reg != 0) { 924 int reg = inst->dst.reg; 925 926 if (!loop_depth) { 927 def[reg] = MIN2(def[reg], ip); 928 } else { 929 def[reg] = MIN2(def[reg], loop_start); 930 } 931 } 932 } 933 934 ip++; 935 } 936 937 ralloc_free(this->virtual_grf_def); 938 ralloc_free(this->virtual_grf_use); 939 this->virtual_grf_def = def; 940 this->virtual_grf_use = use; 941 942 this->live_intervals_valid = true; 943} 944 945/** 946 * Attempts to move immediate constants into the immediate 947 * constant slot of following instructions. 948 * 949 * Immediate constants are a bit tricky -- they have to be in the last 950 * operand slot, you can't do abs/negate on them, 951 */ 952 953bool 954fs_visitor::propagate_constants() 955{ 956 bool progress = false; 957 958 calculate_live_intervals(); 959 960 foreach_list(node, &this->instructions) { 961 fs_inst *inst = (fs_inst *)node; 962 963 if (inst->opcode != BRW_OPCODE_MOV || 964 inst->predicated || 965 inst->dst.file != GRF || inst->src[0].file != IMM || 966 inst->dst.type != inst->src[0].type || 967 (c->dispatch_width == 16 && 968 (inst->force_uncompressed || inst->force_sechalf))) 969 continue; 970 971 /* Don't bother with cases where we should have had the 972 * operation on the constant folded in GLSL already. 973 */ 974 if (inst->saturate) 975 continue; 976 977 /* Found a move of a constant to a GRF. Find anything else using the GRF 978 * before it's written, and replace it with the constant if we can. 979 */ 980 for (fs_inst *scan_inst = (fs_inst *)inst->next; 981 !scan_inst->is_tail_sentinel(); 982 scan_inst = (fs_inst *)scan_inst->next) { 983 if (scan_inst->opcode == BRW_OPCODE_DO || 984 scan_inst->opcode == BRW_OPCODE_WHILE || 985 scan_inst->opcode == BRW_OPCODE_ELSE || 986 scan_inst->opcode == BRW_OPCODE_ENDIF) { 987 break; 988 } 989 990 for (int i = 2; i >= 0; i--) { 991 if (scan_inst->src[i].file != GRF || 992 scan_inst->src[i].reg != inst->dst.reg || 993 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 994 continue; 995 996 /* Don't bother with cases where we should have had the 997 * operation on the constant folded in GLSL already. 998 */ 999 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 1000 continue; 1001 1002 switch (scan_inst->opcode) { 1003 case BRW_OPCODE_MOV: 1004 scan_inst->src[i] = inst->src[0]; 1005 progress = true; 1006 break; 1007 1008 case BRW_OPCODE_MUL: 1009 case BRW_OPCODE_ADD: 1010 if (i == 1) { 1011 scan_inst->src[i] = inst->src[0]; 1012 progress = true; 1013 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1014 /* Fit this constant in by commuting the operands */ 1015 scan_inst->src[0] = scan_inst->src[1]; 1016 scan_inst->src[1] = inst->src[0]; 1017 progress = true; 1018 } 1019 break; 1020 1021 case BRW_OPCODE_CMP: 1022 if (i == 1) { 1023 scan_inst->src[i] = inst->src[0]; 1024 progress = true; 1025 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1026 uint32_t new_cmod; 1027 1028 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 1029 if (new_cmod != ~0u) { 1030 /* Fit this constant in by swapping the operands and 1031 * flipping the test 1032 */ 1033 scan_inst->src[0] = scan_inst->src[1]; 1034 scan_inst->src[1] = inst->src[0]; 1035 scan_inst->conditional_mod = new_cmod; 1036 progress = true; 1037 } 1038 } 1039 break; 1040 1041 case BRW_OPCODE_SEL: 1042 if (i == 1) { 1043 scan_inst->src[i] = inst->src[0]; 1044 progress = true; 1045 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1046 scan_inst->src[0] = scan_inst->src[1]; 1047 scan_inst->src[1] = inst->src[0]; 1048 1049 /* If this was predicated, flipping operands means 1050 * we also need to flip the predicate. 1051 */ 1052 if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) { 1053 scan_inst->predicate_inverse = 1054 !scan_inst->predicate_inverse; 1055 } 1056 progress = true; 1057 } 1058 break; 1059 1060 case FS_OPCODE_RCP: 1061 /* The hardware doesn't do math on immediate values 1062 * (because why are you doing that, seriously?), but 1063 * the correct answer is to just constant fold it 1064 * anyway. 1065 */ 1066 assert(i == 0); 1067 if (inst->src[0].imm.f != 0.0f) { 1068 scan_inst->opcode = BRW_OPCODE_MOV; 1069 scan_inst->src[0] = inst->src[0]; 1070 scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f; 1071 progress = true; 1072 } 1073 break; 1074 } 1075 } 1076 1077 if (scan_inst->dst.file == GRF && 1078 scan_inst->dst.reg == inst->dst.reg && 1079 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1080 scan_inst->is_tex())) { 1081 break; 1082 } 1083 } 1084 } 1085 1086 if (progress) 1087 this->live_intervals_valid = false; 1088 1089 return progress; 1090} 1091 1092 1093/** 1094 * Attempts to move immediate constants into the immediate 1095 * constant slot of following instructions. 1096 * 1097 * Immediate constants are a bit tricky -- they have to be in the last 1098 * operand slot, you can't do abs/negate on them, 1099 */ 1100 1101bool 1102fs_visitor::opt_algebraic() 1103{ 1104 bool progress = false; 1105 1106 calculate_live_intervals(); 1107 1108 foreach_list(node, &this->instructions) { 1109 fs_inst *inst = (fs_inst *)node; 1110 1111 switch (inst->opcode) { 1112 case BRW_OPCODE_MUL: 1113 if (inst->src[1].file != IMM) 1114 continue; 1115 1116 /* a * 1.0 = a */ 1117 if (inst->src[1].type == BRW_REGISTER_TYPE_F && 1118 inst->src[1].imm.f == 1.0) { 1119 inst->opcode = BRW_OPCODE_MOV; 1120 inst->src[1] = reg_undef; 1121 progress = true; 1122 break; 1123 } 1124 1125 break; 1126 } 1127 } 1128 1129 return progress; 1130} 1131 1132/** 1133 * Must be called after calculate_live_intervales() to remove unused 1134 * writes to registers -- register allocation will fail otherwise 1135 * because something deffed but not used won't be considered to 1136 * interfere with other regs. 1137 */ 1138bool 1139fs_visitor::dead_code_eliminate() 1140{ 1141 bool progress = false; 1142 int pc = 0; 1143 1144 calculate_live_intervals(); 1145 1146 foreach_list_safe(node, &this->instructions) { 1147 fs_inst *inst = (fs_inst *)node; 1148 1149 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 1150 inst->remove(); 1151 progress = true; 1152 } 1153 1154 pc++; 1155 } 1156 1157 if (progress) 1158 live_intervals_valid = false; 1159 1160 return progress; 1161} 1162 1163bool 1164fs_visitor::register_coalesce() 1165{ 1166 bool progress = false; 1167 int if_depth = 0; 1168 int loop_depth = 0; 1169 1170 foreach_list_safe(node, &this->instructions) { 1171 fs_inst *inst = (fs_inst *)node; 1172 1173 /* Make sure that we dominate the instructions we're going to 1174 * scan for interfering with our coalescing, or we won't have 1175 * scanned enough to see if anything interferes with our 1176 * coalescing. We don't dominate the following instructions if 1177 * we're in a loop or an if block. 1178 */ 1179 switch (inst->opcode) { 1180 case BRW_OPCODE_DO: 1181 loop_depth++; 1182 break; 1183 case BRW_OPCODE_WHILE: 1184 loop_depth--; 1185 break; 1186 case BRW_OPCODE_IF: 1187 if_depth++; 1188 break; 1189 case BRW_OPCODE_ENDIF: 1190 if_depth--; 1191 break; 1192 } 1193 if (loop_depth || if_depth) 1194 continue; 1195 1196 if (inst->opcode != BRW_OPCODE_MOV || 1197 inst->predicated || 1198 inst->saturate || 1199 inst->dst.file != GRF || (inst->src[0].file != GRF && 1200 inst->src[0].file != UNIFORM)|| 1201 inst->dst.type != inst->src[0].type) 1202 continue; 1203 1204 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 1205 1206 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 1207 * them: check for no writes to either one until the exit of the 1208 * program. 1209 */ 1210 bool interfered = false; 1211 1212 for (fs_inst *scan_inst = (fs_inst *)inst->next; 1213 !scan_inst->is_tail_sentinel(); 1214 scan_inst = (fs_inst *)scan_inst->next) { 1215 if (scan_inst->dst.file == GRF) { 1216 if (scan_inst->dst.reg == inst->dst.reg && 1217 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1218 scan_inst->is_tex())) { 1219 interfered = true; 1220 break; 1221 } 1222 if (inst->src[0].file == GRF && 1223 scan_inst->dst.reg == inst->src[0].reg && 1224 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 1225 scan_inst->is_tex())) { 1226 interfered = true; 1227 break; 1228 } 1229 } 1230 1231 /* The gen6 MATH instruction can't handle source modifiers or 1232 * unusual register regions, so avoid coalescing those for 1233 * now. We should do something more specific. 1234 */ 1235 if (intel->gen >= 6 && 1236 scan_inst->is_math() && 1237 (has_source_modifiers || inst->src[0].file == UNIFORM)) { 1238 interfered = true; 1239 break; 1240 } 1241 } 1242 if (interfered) { 1243 continue; 1244 } 1245 1246 /* Rewrite the later usage to point at the source of the move to 1247 * be removed. 1248 */ 1249 for (fs_inst *scan_inst = inst; 1250 !scan_inst->is_tail_sentinel(); 1251 scan_inst = (fs_inst *)scan_inst->next) { 1252 for (int i = 0; i < 3; i++) { 1253 if (scan_inst->src[i].file == GRF && 1254 scan_inst->src[i].reg == inst->dst.reg && 1255 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 1256 fs_reg new_src = inst->src[0]; 1257 new_src.negate ^= scan_inst->src[i].negate; 1258 new_src.abs |= scan_inst->src[i].abs; 1259 scan_inst->src[i] = new_src; 1260 } 1261 } 1262 } 1263 1264 inst->remove(); 1265 progress = true; 1266 } 1267 1268 if (progress) 1269 live_intervals_valid = false; 1270 1271 return progress; 1272} 1273 1274 1275bool 1276fs_visitor::compute_to_mrf() 1277{ 1278 bool progress = false; 1279 int next_ip = 0; 1280 1281 calculate_live_intervals(); 1282 1283 foreach_list_safe(node, &this->instructions) { 1284 fs_inst *inst = (fs_inst *)node; 1285 1286 int ip = next_ip; 1287 next_ip++; 1288 1289 if (inst->opcode != BRW_OPCODE_MOV || 1290 inst->predicated || 1291 inst->dst.file != MRF || inst->src[0].file != GRF || 1292 inst->dst.type != inst->src[0].type || 1293 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 1294 continue; 1295 1296 /* Work out which hardware MRF registers are written by this 1297 * instruction. 1298 */ 1299 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1300 int mrf_high; 1301 if (inst->dst.hw_reg & BRW_MRF_COMPR4) { 1302 mrf_high = mrf_low + 4; 1303 } else if (c->dispatch_width == 16 && 1304 (!inst->force_uncompressed && !inst->force_sechalf)) { 1305 mrf_high = mrf_low + 1; 1306 } else { 1307 mrf_high = mrf_low; 1308 } 1309 1310 /* Can't compute-to-MRF this GRF if someone else was going to 1311 * read it later. 1312 */ 1313 if (this->virtual_grf_use[inst->src[0].reg] > ip) 1314 continue; 1315 1316 /* Found a move of a GRF to a MRF. Let's see if we can go 1317 * rewrite the thing that made this GRF to write into the MRF. 1318 */ 1319 fs_inst *scan_inst; 1320 for (scan_inst = (fs_inst *)inst->prev; 1321 scan_inst->prev != NULL; 1322 scan_inst = (fs_inst *)scan_inst->prev) { 1323 if (scan_inst->dst.file == GRF && 1324 scan_inst->dst.reg == inst->src[0].reg) { 1325 /* Found the last thing to write our reg we want to turn 1326 * into a compute-to-MRF. 1327 */ 1328 1329 if (scan_inst->is_tex()) { 1330 /* texturing writes several continuous regs, so we can't 1331 * compute-to-mrf that. 1332 */ 1333 break; 1334 } 1335 1336 /* If it's predicated, it (probably) didn't populate all 1337 * the channels. We might be able to rewrite everything 1338 * that writes that reg, but it would require smarter 1339 * tracking to delay the rewriting until complete success. 1340 */ 1341 if (scan_inst->predicated) 1342 break; 1343 1344 /* If it's half of register setup and not the same half as 1345 * our MOV we're trying to remove, bail for now. 1346 */ 1347 if (scan_inst->force_uncompressed != inst->force_uncompressed || 1348 scan_inst->force_sechalf != inst->force_sechalf) { 1349 break; 1350 } 1351 1352 /* SEND instructions can't have MRF as a destination. */ 1353 if (scan_inst->mlen) 1354 break; 1355 1356 if (intel->gen >= 6) { 1357 /* gen6 math instructions must have the destination be 1358 * GRF, so no compute-to-MRF for them. 1359 */ 1360 if (scan_inst->is_math()) { 1361 break; 1362 } 1363 } 1364 1365 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1366 /* Found the creator of our MRF's source value. */ 1367 scan_inst->dst.file = MRF; 1368 scan_inst->dst.hw_reg = inst->dst.hw_reg; 1369 scan_inst->saturate |= inst->saturate; 1370 inst->remove(); 1371 progress = true; 1372 } 1373 break; 1374 } 1375 1376 /* We don't handle flow control here. Most computation of 1377 * values that end up in MRFs are shortly before the MRF 1378 * write anyway. 1379 */ 1380 if (scan_inst->opcode == BRW_OPCODE_DO || 1381 scan_inst->opcode == BRW_OPCODE_WHILE || 1382 scan_inst->opcode == BRW_OPCODE_ELSE || 1383 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1384 break; 1385 } 1386 1387 /* You can't read from an MRF, so if someone else reads our 1388 * MRF's source GRF that we wanted to rewrite, that stops us. 1389 */ 1390 bool interfered = false; 1391 for (int i = 0; i < 3; i++) { 1392 if (scan_inst->src[i].file == GRF && 1393 scan_inst->src[i].reg == inst->src[0].reg && 1394 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 1395 interfered = true; 1396 } 1397 } 1398 if (interfered) 1399 break; 1400 1401 if (scan_inst->dst.file == MRF) { 1402 /* If somebody else writes our MRF here, we can't 1403 * compute-to-MRF before that. 1404 */ 1405 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1406 int scan_mrf_high; 1407 1408 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) { 1409 scan_mrf_high = scan_mrf_low + 4; 1410 } else if (c->dispatch_width == 16 && 1411 (!scan_inst->force_uncompressed && 1412 !scan_inst->force_sechalf)) { 1413 scan_mrf_high = scan_mrf_low + 1; 1414 } else { 1415 scan_mrf_high = scan_mrf_low; 1416 } 1417 1418 if (mrf_low == scan_mrf_low || 1419 mrf_low == scan_mrf_high || 1420 mrf_high == scan_mrf_low || 1421 mrf_high == scan_mrf_high) { 1422 break; 1423 } 1424 } 1425 1426 if (scan_inst->mlen > 0) { 1427 /* Found a SEND instruction, which means that there are 1428 * live values in MRFs from base_mrf to base_mrf + 1429 * scan_inst->mlen - 1. Don't go pushing our MRF write up 1430 * above it. 1431 */ 1432 if (mrf_low >= scan_inst->base_mrf && 1433 mrf_low < scan_inst->base_mrf + scan_inst->mlen) { 1434 break; 1435 } 1436 if (mrf_high >= scan_inst->base_mrf && 1437 mrf_high < scan_inst->base_mrf + scan_inst->mlen) { 1438 break; 1439 } 1440 } 1441 } 1442 } 1443 1444 return progress; 1445} 1446 1447/** 1448 * Walks through basic blocks, locking for repeated MRF writes and 1449 * removing the later ones. 1450 */ 1451bool 1452fs_visitor::remove_duplicate_mrf_writes() 1453{ 1454 fs_inst *last_mrf_move[16]; 1455 bool progress = false; 1456 1457 /* Need to update the MRF tracking for compressed instructions. */ 1458 if (c->dispatch_width == 16) 1459 return false; 1460 1461 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1462 1463 foreach_list_safe(node, &this->instructions) { 1464 fs_inst *inst = (fs_inst *)node; 1465 1466 switch (inst->opcode) { 1467 case BRW_OPCODE_DO: 1468 case BRW_OPCODE_WHILE: 1469 case BRW_OPCODE_IF: 1470 case BRW_OPCODE_ELSE: 1471 case BRW_OPCODE_ENDIF: 1472 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1473 continue; 1474 default: 1475 break; 1476 } 1477 1478 if (inst->opcode == BRW_OPCODE_MOV && 1479 inst->dst.file == MRF) { 1480 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 1481 if (prev_inst && inst->equals(prev_inst)) { 1482 inst->remove(); 1483 progress = true; 1484 continue; 1485 } 1486 } 1487 1488 /* Clear out the last-write records for MRFs that were overwritten. */ 1489 if (inst->dst.file == MRF) { 1490 last_mrf_move[inst->dst.hw_reg] = NULL; 1491 } 1492 1493 if (inst->mlen > 0) { 1494 /* Found a SEND instruction, which will include two or fewer 1495 * implied MRF writes. We could do better here. 1496 */ 1497 for (int i = 0; i < implied_mrf_writes(inst); i++) { 1498 last_mrf_move[inst->base_mrf + i] = NULL; 1499 } 1500 } 1501 1502 /* Clear out any MRF move records whose sources got overwritten. */ 1503 if (inst->dst.file == GRF) { 1504 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 1505 if (last_mrf_move[i] && 1506 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 1507 last_mrf_move[i] = NULL; 1508 } 1509 } 1510 } 1511 1512 if (inst->opcode == BRW_OPCODE_MOV && 1513 inst->dst.file == MRF && 1514 inst->src[0].file == GRF && 1515 !inst->predicated) { 1516 last_mrf_move[inst->dst.hw_reg] = inst; 1517 } 1518 } 1519 1520 return progress; 1521} 1522 1523bool 1524fs_visitor::virtual_grf_interferes(int a, int b) 1525{ 1526 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 1527 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 1528 1529 /* We can't handle dead register writes here, without iterating 1530 * over the whole instruction stream to find every single dead 1531 * write to that register to compare to the live interval of the 1532 * other register. Just assert that dead_code_eliminate() has been 1533 * called. 1534 */ 1535 assert((this->virtual_grf_use[a] != -1 || 1536 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 1537 (this->virtual_grf_use[b] != -1 || 1538 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 1539 1540 /* If the register is used to store 16 values of less than float 1541 * size (only the case for pixel_[xy]), then we can't allocate 1542 * another dword-sized thing to that register that would be used in 1543 * the same instruction. This is because when the GPU decodes (for 1544 * example): 1545 * 1546 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 1547 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 1548 * 1549 * it's actually processed as: 1550 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 1551 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 1552 * 1553 * so our second half values in g6 got overwritten in the first 1554 * half. 1555 */ 1556 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 1557 this->pixel_x.reg == b || 1558 this->pixel_y.reg == a || 1559 this->pixel_y.reg == b)) { 1560 return start <= end; 1561 } 1562 1563 return start < end; 1564} 1565 1566bool 1567fs_visitor::run() 1568{ 1569 uint32_t prog_offset_16 = 0; 1570 uint32_t orig_nr_params = c->prog_data.nr_params; 1571 1572 brw_wm_payload_setup(brw, c); 1573 1574 if (c->dispatch_width == 16) { 1575 /* align to 64 byte boundary. */ 1576 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { 1577 brw_NOP(p); 1578 } 1579 1580 /* Save off the start of this 16-wide program in case we succeed. */ 1581 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); 1582 1583 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 1584 } 1585 1586 if (0) { 1587 emit_dummy_fs(); 1588 } else { 1589 calculate_urb_setup(); 1590 if (intel->gen < 6) 1591 emit_interpolation_setup_gen4(); 1592 else 1593 emit_interpolation_setup_gen6(); 1594 1595 /* Generate FS IR for main(). (the visitor only descends into 1596 * functions called "main"). 1597 */ 1598 foreach_list(node, &*shader->ir) { 1599 ir_instruction *ir = (ir_instruction *)node; 1600 base_ir = ir; 1601 this->result = reg_undef; 1602 ir->accept(this); 1603 } 1604 if (failed) 1605 return false; 1606 1607 emit_fb_writes(); 1608 1609 split_virtual_grfs(); 1610 1611 setup_paramvalues_refs(); 1612 setup_pull_constants(); 1613 1614 bool progress; 1615 do { 1616 progress = false; 1617 1618 progress = remove_duplicate_mrf_writes() || progress; 1619 1620 progress = propagate_constants() || progress; 1621 progress = opt_algebraic() || progress; 1622 progress = register_coalesce() || progress; 1623 progress = compute_to_mrf() || progress; 1624 progress = dead_code_eliminate() || progress; 1625 } while (progress); 1626 1627 schedule_instructions(); 1628 1629 assign_curb_setup(); 1630 assign_urb_setup(); 1631 1632 if (0) { 1633 /* Debug of register spilling: Go spill everything. */ 1634 int virtual_grf_count = virtual_grf_next; 1635 for (int i = 1; i < virtual_grf_count; i++) { 1636 spill_reg(i); 1637 } 1638 } 1639 1640 if (0) 1641 assign_regs_trivial(); 1642 else { 1643 while (!assign_regs()) { 1644 if (failed) 1645 break; 1646 } 1647 } 1648 } 1649 assert(force_uncompressed_stack == 0); 1650 assert(force_sechalf_stack == 0); 1651 1652 if (failed) 1653 return false; 1654 1655 generate_code(); 1656 1657 if (c->dispatch_width == 8) { 1658 c->prog_data.reg_blocks = brw_register_blocks(grf_used); 1659 } else { 1660 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); 1661 c->prog_data.prog_offset_16 = prog_offset_16; 1662 1663 /* Make sure we didn't try to sneak in an extra uniform */ 1664 assert(orig_nr_params == c->prog_data.nr_params); 1665 } 1666 1667 return !failed; 1668} 1669 1670bool 1671brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, 1672 struct gl_shader_program *prog) 1673{ 1674 struct intel_context *intel = &brw->intel; 1675 1676 if (!prog) 1677 return false; 1678 1679 struct brw_shader *shader = 1680 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 1681 if (!shader) 1682 return false; 1683 1684 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1685 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 1686 _mesa_print_ir(shader->ir, NULL); 1687 printf("\n\n"); 1688 } 1689 1690 /* Now the main event: Visit the shader IR and generate our FS IR for it. 1691 */ 1692 c->dispatch_width = 8; 1693 1694 fs_visitor v(c, prog, shader); 1695 if (!v.run()) { 1696 prog->LinkStatus = GL_FALSE; 1697 prog->InfoLog = ralloc_strdup(prog, v.fail_msg); 1698 1699 return false; 1700 } 1701 1702 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { 1703 c->dispatch_width = 16; 1704 fs_visitor v2(c, prog, shader); 1705 v2.import_uniforms(v.variable_ht); 1706 v2.run(); 1707 } 1708 1709 c->prog_data.dispatch_width = 8; 1710 1711 return true; 1712} 1713 1714bool 1715brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) 1716{ 1717 struct brw_context *brw = brw_context(ctx); 1718 struct brw_wm_prog_key key; 1719 struct gl_fragment_program *fp = prog->FragmentProgram; 1720 struct brw_fragment_program *bfp = brw_fragment_program(fp); 1721 1722 if (!fp) 1723 return true; 1724 1725 memset(&key, 0, sizeof(key)); 1726 1727 if (fp->UsesKill) 1728 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; 1729 1730 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 1731 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; 1732 1733 /* Just assume depth testing. */ 1734 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; 1735 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; 1736 1737 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); 1738 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { 1739 int vp_index = -1; 1740 1741 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) 1742 continue; 1743 1744 key.proj_attrib_mask |= 1 << i; 1745 1746 if (i <= FRAG_ATTRIB_TEX7) 1747 vp_index = i; 1748 else if (i >= FRAG_ATTRIB_VAR0) 1749 vp_index = i - FRAG_ATTRIB_VAR0 + VERT_RESULT_VAR0; 1750 1751 if (vp_index >= 0) 1752 key.vp_outputs_written |= BITFIELD64_BIT(vp_index); 1753 } 1754 1755 key.clamp_fragment_color = true; 1756 1757 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) { 1758 if (fp->Base.ShadowSamplers & (1 << i)) 1759 key.compare_funcs[i] = GL_LESS; 1760 1761 /* FINISHME: depth compares might use (0,0,0,W) for example */ 1762 key.tex_swizzles[i] = SWIZZLE_XYZW; 1763 } 1764 1765 if (fp->Base.InputsRead & FRAG_BIT_WPOS) { 1766 key.drawable_height = ctx->DrawBuffer->Height; 1767 key.render_to_fbo = ctx->DrawBuffer->Name != 0; 1768 } 1769 1770 key.nr_color_regions = 1; 1771 1772 key.program_string_id = bfp->id; 1773 1774 uint32_t old_prog_offset = brw->wm.prog_offset; 1775 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; 1776 1777 bool success = do_wm_prog(brw, prog, bfp, &key); 1778 1779 brw->wm.prog_offset = old_prog_offset; 1780 brw->wm.prog_data = old_prog_data; 1781 1782 return success; 1783} 1784