brw_fs.cpp revision 6430df37736d71dd2bd6f1fe447d39f0b68cb567
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
29 */ 30 31extern "C" { 32 33#include <sys/types.h> 34 35#include "main/macros.h" 36#include "main/shaderobj.h" 37#include "main/uniforms.h" 38#include "program/prog_parameter.h" 39#include "program/prog_print.h" 40#include "program/register_allocate.h" 41#include "program/sampler.h" 42#include "program/hash_table.h" 43#include "brw_context.h" 44#include "brw_eu.h" 45#include "brw_wm.h" 46} 47#include "brw_shader.h" 48#include "brw_fs.h" 49#include "../glsl/glsl_types.h" 50#include "../glsl/ir_print_visitor.h" 51 52#define MAX_INSTRUCTION (1 << 30) 53 54int 55fs_visitor::type_size(const struct glsl_type *type) 56{ 57 unsigned int size, i; 58 59 switch (type->base_type) { 60 case GLSL_TYPE_UINT: 61 case GLSL_TYPE_INT: 62 case GLSL_TYPE_FLOAT: 63 case GLSL_TYPE_BOOL: 64 return type->components(); 65 case GLSL_TYPE_ARRAY: 66 return type_size(type->fields.array) * type->length; 67 case GLSL_TYPE_STRUCT: 68 size = 0; 69 for (i = 0; i < type->length; i++) { 70 size += type_size(type->fields.structure[i].type); 71 } 72 return size; 73 case GLSL_TYPE_SAMPLER: 74 /* Samplers take up no register space, since they're baked in at 75 * link time. 76 */ 77 return 0; 78 default: 79 assert(!"not reached"); 80 return 0; 81 } 82} 83 84void 85fs_visitor::fail(const char *format, ...) 
86{ 87 va_list va; 88 char *msg; 89 90 if (failed) 91 return; 92 93 failed = true; 94 95 va_start(va, format); 96 msg = ralloc_vasprintf(mem_ctx, format, va); 97 va_end(va); 98 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); 99 100 this->fail_msg = msg; 101 102 if (INTEL_DEBUG & DEBUG_WM) { 103 fprintf(stderr, "%s", msg); 104 } 105} 106 107void 108fs_visitor::push_force_uncompressed() 109{ 110 force_uncompressed_stack++; 111} 112 113void 114fs_visitor::pop_force_uncompressed() 115{ 116 force_uncompressed_stack--; 117 assert(force_uncompressed_stack >= 0); 118} 119 120void 121fs_visitor::push_force_sechalf() 122{ 123 force_sechalf_stack++; 124} 125 126void 127fs_visitor::pop_force_sechalf() 128{ 129 force_sechalf_stack--; 130 assert(force_sechalf_stack >= 0); 131} 132 133/** 134 * Returns how many MRFs an FS opcode will write over. 135 * 136 * Note that this is not the 0 or 1 implied writes in an actual gen 137 * instruction -- the FS opcodes often generate MOVs in addition. 
138 */ 139int 140fs_visitor::implied_mrf_writes(fs_inst *inst) 141{ 142 if (inst->mlen == 0) 143 return 0; 144 145 switch (inst->opcode) { 146 case FS_OPCODE_RCP: 147 case FS_OPCODE_RSQ: 148 case FS_OPCODE_SQRT: 149 case FS_OPCODE_EXP2: 150 case FS_OPCODE_LOG2: 151 case FS_OPCODE_SIN: 152 case FS_OPCODE_COS: 153 return 1 * c->dispatch_width / 8; 154 case FS_OPCODE_POW: 155 return 2 * c->dispatch_width / 8; 156 case FS_OPCODE_TEX: 157 case FS_OPCODE_TXB: 158 case FS_OPCODE_TXD: 159 case FS_OPCODE_TXL: 160 return 1; 161 case FS_OPCODE_FB_WRITE: 162 return 2; 163 case FS_OPCODE_PULL_CONSTANT_LOAD: 164 case FS_OPCODE_UNSPILL: 165 return 1; 166 case FS_OPCODE_SPILL: 167 return 2; 168 default: 169 assert(!"not reached"); 170 return inst->mlen; 171 } 172} 173 174int 175fs_visitor::virtual_grf_alloc(int size) 176{ 177 if (virtual_grf_array_size <= virtual_grf_next) { 178 if (virtual_grf_array_size == 0) 179 virtual_grf_array_size = 16; 180 else 181 virtual_grf_array_size *= 2; 182 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 183 virtual_grf_array_size); 184 185 /* This slot is always unused. */ 186 virtual_grf_sizes[0] = 0; 187 } 188 virtual_grf_sizes[virtual_grf_next] = size; 189 return virtual_grf_next++; 190} 191 192/** Fixed HW reg constructor. */ 193fs_reg::fs_reg(enum register_file file, int hw_reg) 194{ 195 init(); 196 this->file = file; 197 this->hw_reg = hw_reg; 198 this->type = BRW_REGISTER_TYPE_F; 199} 200 201/** Fixed HW reg constructor. */ 202fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 203{ 204 init(); 205 this->file = file; 206 this->hw_reg = hw_reg; 207 this->type = type; 208} 209 210/** Automatic reg constructor. 
*/ 211fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 212{ 213 init(); 214 215 this->file = GRF; 216 this->reg = v->virtual_grf_alloc(v->type_size(type)); 217 this->reg_offset = 0; 218 this->type = brw_type_for_base_type(type); 219} 220 221fs_reg * 222fs_visitor::variable_storage(ir_variable *var) 223{ 224 return (fs_reg *)hash_table_find(this->variable_ht, var); 225} 226 227void 228import_uniforms_callback(const void *key, 229 void *data, 230 void *closure) 231{ 232 struct hash_table *dst_ht = (struct hash_table *)closure; 233 const fs_reg *reg = (const fs_reg *)data; 234 235 if (reg->file != UNIFORM) 236 return; 237 238 hash_table_insert(dst_ht, data, key); 239} 240 241/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 242 * This brings in those uniform definitions 243 */ 244void 245fs_visitor::import_uniforms(struct hash_table *src_variable_ht) 246{ 247 hash_table_call_foreach(src_variable_ht, 248 import_uniforms_callback, 249 variable_ht); 250} 251 252/* Our support for uniforms is piggy-backed on the struct 253 * gl_fragment_program, because that's where the values actually 254 * get stored, rather than in some global gl_shader_program uniform 255 * store. 
256 */ 257int 258fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 259{ 260 unsigned int offset = 0; 261 262 if (type->is_matrix()) { 263 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 264 type->vector_elements, 265 1); 266 267 for (unsigned int i = 0; i < type->matrix_columns; i++) { 268 offset += setup_uniform_values(loc + offset, column); 269 } 270 271 return offset; 272 } 273 274 switch (type->base_type) { 275 case GLSL_TYPE_FLOAT: 276 case GLSL_TYPE_UINT: 277 case GLSL_TYPE_INT: 278 case GLSL_TYPE_BOOL: 279 for (unsigned int i = 0; i < type->vector_elements; i++) { 280 unsigned int param = c->prog_data.nr_params++; 281 282 assert(param < ARRAY_SIZE(c->prog_data.param)); 283 284 switch (type->base_type) { 285 case GLSL_TYPE_FLOAT: 286 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 287 break; 288 case GLSL_TYPE_UINT: 289 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 290 break; 291 case GLSL_TYPE_INT: 292 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 293 break; 294 case GLSL_TYPE_BOOL: 295 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 296 break; 297 default: 298 assert(!"not reached"); 299 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 300 break; 301 } 302 this->param_index[param] = loc; 303 this->param_offset[param] = i; 304 } 305 return 1; 306 307 case GLSL_TYPE_STRUCT: 308 for (unsigned int i = 0; i < type->length; i++) { 309 offset += setup_uniform_values(loc + offset, 310 type->fields.structure[i].type); 311 } 312 return offset; 313 314 case GLSL_TYPE_ARRAY: 315 for (unsigned int i = 0; i < type->length; i++) { 316 offset += setup_uniform_values(loc + offset, type->fields.array); 317 } 318 return offset; 319 320 case GLSL_TYPE_SAMPLER: 321 /* The sampler takes up a slot, but we don't use any values from it. */ 322 return 1; 323 324 default: 325 assert(!"not reached"); 326 return 0; 327 } 328} 329 330 331/* Our support for builtin uniforms is even scarier than non-builtin. 
332 * It sits on top of the PROG_STATE_VAR parameters that are 333 * automatically updated from GL context state. 334 */ 335void 336fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 337{ 338 const ir_state_slot *const slots = ir->state_slots; 339 assert(ir->state_slots != NULL); 340 341 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 342 /* This state reference has already been setup by ir_to_mesa, but we'll 343 * get the same index back here. 344 */ 345 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 346 (gl_state_index *)slots[i].tokens); 347 348 /* Add each of the unique swizzles of the element as a parameter. 349 * This'll end up matching the expected layout of the 350 * array/matrix/structure we're trying to fill in. 351 */ 352 int last_swiz = -1; 353 for (unsigned int j = 0; j < 4; j++) { 354 int swiz = GET_SWZ(slots[i].swizzle, j); 355 if (swiz == last_swiz) 356 break; 357 last_swiz = swiz; 358 359 c->prog_data.param_convert[c->prog_data.nr_params] = 360 PARAM_NO_CONVERT; 361 this->param_index[c->prog_data.nr_params] = index; 362 this->param_offset[c->prog_data.nr_params] = swiz; 363 c->prog_data.nr_params++; 364 } 365 } 366} 367 368fs_reg * 369fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 370{ 371 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 372 fs_reg wpos = *reg; 373 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 374 375 /* gl_FragCoord.x */ 376 if (ir->pixel_center_integer) { 377 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 378 } else { 379 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 380 } 381 wpos.reg_offset++; 382 383 /* gl_FragCoord.y */ 384 if (!flip && ir->pixel_center_integer) { 385 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 386 } else { 387 fs_reg pixel_y = this->pixel_y; 388 float offset = (ir->pixel_center_integer ? 
0.0 : 0.5); 389 390 if (flip) { 391 pixel_y.negate = true; 392 offset += c->key.drawable_height - 1.0; 393 } 394 395 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 396 } 397 wpos.reg_offset++; 398 399 /* gl_FragCoord.z */ 400 if (intel->gen >= 6) { 401 emit(BRW_OPCODE_MOV, wpos, 402 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 403 } else { 404 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 405 interp_reg(FRAG_ATTRIB_WPOS, 2)); 406 } 407 wpos.reg_offset++; 408 409 /* gl_FragCoord.w: Already set up in emit_interpolation */ 410 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 411 412 return reg; 413} 414 415fs_reg * 416fs_visitor::emit_general_interpolation(ir_variable *ir) 417{ 418 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 419 /* Interpolation is always in floating point regs. */ 420 reg->type = BRW_REGISTER_TYPE_F; 421 fs_reg attr = *reg; 422 423 unsigned int array_elements; 424 const glsl_type *type; 425 426 if (ir->type->is_array()) { 427 array_elements = ir->type->length; 428 if (array_elements == 0) { 429 fail("dereferenced array '%s' has length 0\n", ir->name); 430 } 431 type = ir->type->fields.array; 432 } else { 433 array_elements = 1; 434 type = ir->type; 435 } 436 437 int location = ir->location; 438 for (unsigned int i = 0; i < array_elements; i++) { 439 for (unsigned int j = 0; j < type->matrix_columns; j++) { 440 if (urb_setup[location] == -1) { 441 /* If there's no incoming setup data for this slot, don't 442 * emit interpolation for it. 443 */ 444 attr.reg_offset += type->vector_elements; 445 location++; 446 continue; 447 } 448 449 bool is_gl_Color = 450 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 451 452 if (c->key.flat_shade && is_gl_Color) { 453 /* Constant interpolation (flat shading) case. The SF has 454 * handed us defined values in only the constant offset 455 * field of the setup reg. 
456 */ 457 for (unsigned int k = 0; k < type->vector_elements; k++) { 458 struct brw_reg interp = interp_reg(location, k); 459 interp = suboffset(interp, 3); 460 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 461 attr.reg_offset++; 462 } 463 } else { 464 /* Perspective interpolation case. */ 465 for (unsigned int k = 0; k < type->vector_elements; k++) { 466 struct brw_reg interp = interp_reg(location, k); 467 emit(FS_OPCODE_LINTERP, attr, 468 this->delta_x, this->delta_y, fs_reg(interp)); 469 attr.reg_offset++; 470 } 471 472 if (intel->gen < 6) { 473 attr.reg_offset -= type->vector_elements; 474 for (unsigned int k = 0; k < type->vector_elements; k++) { 475 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 476 attr.reg_offset++; 477 } 478 } 479 } 480 location++; 481 } 482 } 483 484 return reg; 485} 486 487fs_reg * 488fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 489{ 490 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 491 492 /* The frontfacing comes in as a bit in the thread payload. 
*/ 493 if (intel->gen >= 6) { 494 emit(BRW_OPCODE_ASR, *reg, 495 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 496 fs_reg(15)); 497 emit(BRW_OPCODE_NOT, *reg, *reg); 498 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 499 } else { 500 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 501 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 502 * us front face 503 */ 504 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 505 fs_reg(r1_6ud), 506 fs_reg(1u << 31)); 507 inst->conditional_mod = BRW_CONDITIONAL_L; 508 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 509 } 510 511 return reg; 512} 513 514fs_inst * 515fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 516{ 517 switch (opcode) { 518 case FS_OPCODE_RCP: 519 case FS_OPCODE_RSQ: 520 case FS_OPCODE_SQRT: 521 case FS_OPCODE_EXP2: 522 case FS_OPCODE_LOG2: 523 case FS_OPCODE_SIN: 524 case FS_OPCODE_COS: 525 break; 526 default: 527 assert(!"not reached: bad math opcode"); 528 return NULL; 529 } 530 531 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 532 * might be able to do better by doing execsize = 1 math and then 533 * expanding that result out, but we would need to be careful with 534 * masking. 535 * 536 * The hardware ignores source modifiers (negate and abs) on math 537 * instructions, so we also move to a temp to set those up. 
538 */ 539 if (intel->gen >= 6 && (src.file == UNIFORM || 540 src.abs || 541 src.negate)) { 542 fs_reg expanded = fs_reg(this, glsl_type::float_type); 543 emit(BRW_OPCODE_MOV, expanded, src); 544 src = expanded; 545 } 546 547 fs_inst *inst = emit(opcode, dst, src); 548 549 if (intel->gen < 6) { 550 inst->base_mrf = 2; 551 inst->mlen = c->dispatch_width / 8; 552 } 553 554 return inst; 555} 556 557fs_inst * 558fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 559{ 560 int base_mrf = 2; 561 fs_inst *inst; 562 563 assert(opcode == FS_OPCODE_POW); 564 565 if (intel->gen >= 6) { 566 /* Can't do hstride == 0 args to gen6 math, so expand it out. 567 * 568 * The hardware ignores source modifiers (negate and abs) on math 569 * instructions, so we also move to a temp to set those up. 570 */ 571 if (src0.file == UNIFORM || src0.abs || src0.negate) { 572 fs_reg expanded = fs_reg(this, glsl_type::float_type); 573 emit(BRW_OPCODE_MOV, expanded, src0); 574 src0 = expanded; 575 } 576 577 if (src1.file == UNIFORM || src1.abs || src1.negate) { 578 fs_reg expanded = fs_reg(this, glsl_type::float_type); 579 emit(BRW_OPCODE_MOV, expanded, src1); 580 src1 = expanded; 581 } 582 583 inst = emit(opcode, dst, src0, src1); 584 } else { 585 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 586 inst = emit(opcode, dst, src0, reg_null_f); 587 588 inst->base_mrf = base_mrf; 589 inst->mlen = 2 * c->dispatch_width / 8; 590 } 591 return inst; 592} 593 594/** 595 * To be called after the last _mesa_add_state_reference() call, to 596 * set up prog_data.param[] for assign_curb_setup() and 597 * setup_pull_constants(). 598 */ 599void 600fs_visitor::setup_paramvalues_refs() 601{ 602 if (c->dispatch_width != 8) 603 return; 604 605 /* Set up the pointers to ParamValues now that that array is finalized. 
*/ 606 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 607 c->prog_data.param[i] = 608 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 609 this->param_offset[i]; 610 } 611} 612 613void 614fs_visitor::assign_curb_setup() 615{ 616 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 617 if (c->dispatch_width == 8) { 618 c->prog_data.first_curbe_grf = c->nr_payload_regs; 619 } else { 620 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 621 } 622 623 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 624 foreach_iter(exec_list_iterator, iter, this->instructions) { 625 fs_inst *inst = (fs_inst *)iter.get(); 626 627 for (unsigned int i = 0; i < 3; i++) { 628 if (inst->src[i].file == UNIFORM) { 629 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 630 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 631 constant_nr / 8, 632 constant_nr % 8); 633 634 inst->src[i].file = FIXED_HW_REG; 635 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 636 } 637 } 638 } 639} 640 641void 642fs_visitor::calculate_urb_setup() 643{ 644 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 645 urb_setup[i] = -1; 646 } 647 648 int urb_next = 0; 649 /* Figure out where each of the incoming setup attributes lands. */ 650 if (intel->gen >= 6) { 651 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 652 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { 653 urb_setup[i] = urb_next++; 654 } 655 } 656 } else { 657 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ 658 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 659 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 660 int fp_index; 661 662 if (i >= VERT_RESULT_VAR0) 663 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 664 else if (i <= VERT_RESULT_TEX7) 665 fp_index = i; 666 else 667 fp_index = -1; 668 669 if (fp_index >= 0) 670 urb_setup[fp_index] = urb_next++; 671 } 672 } 673 } 674 675 /* Each attribute is 4 setup channels, each of which is half a reg. */ 676 c->prog_data.urb_read_length = urb_next * 2; 677} 678 679void 680fs_visitor::assign_urb_setup() 681{ 682 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 683 684 /* Offset all the urb_setup[] index by the actual position of the 685 * setup regs, now that the location of the constants has been chosen. 686 */ 687 foreach_iter(exec_list_iterator, iter, this->instructions) { 688 fs_inst *inst = (fs_inst *)iter.get(); 689 690 if (inst->opcode == FS_OPCODE_LINTERP) { 691 assert(inst->src[2].file == FIXED_HW_REG); 692 inst->src[2].fixed_hw_reg.nr += urb_start; 693 } 694 695 if (inst->opcode == FS_OPCODE_CINTERP) { 696 assert(inst->src[0].file == FIXED_HW_REG); 697 inst->src[0].fixed_hw_reg.nr += urb_start; 698 } 699 } 700 701 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 702} 703 704/** 705 * Split large virtual GRFs into separate components if we can. 706 * 707 * This is mostly duplicated with what brw_fs_vector_splitting does, 708 * but that's really conservative because it's afraid of doing 709 * splitting that doesn't result in real progress after the rest of 710 * the optimization phases, which would cause infinite looping in 711 * optimization. We can do it once here, safely. This also has the 712 * opportunity to split interpolated values, or maybe even uniforms, 713 * which we don't have at the IR level. 
714 * 715 * We want to split, because virtual GRFs are what we register 716 * allocate and spill (due to contiguousness requirements for some 717 * instructions), and they're what we naturally generate in the 718 * codegen process, but most virtual GRFs don't actually need to be 719 * contiguous sets of GRFs. If we split, we'll end up with reduced 720 * live intervals and better dead code elimination and coalescing. 721 */ 722void 723fs_visitor::split_virtual_grfs() 724{ 725 int num_vars = this->virtual_grf_next; 726 bool split_grf[num_vars]; 727 int new_virtual_grf[num_vars]; 728 729 /* Try to split anything > 0 sized. */ 730 for (int i = 0; i < num_vars; i++) { 731 if (this->virtual_grf_sizes[i] != 1) 732 split_grf[i] = true; 733 else 734 split_grf[i] = false; 735 } 736 737 if (brw->has_pln) { 738 /* PLN opcodes rely on the delta_xy being contiguous. */ 739 split_grf[this->delta_x.reg] = false; 740 } 741 742 foreach_iter(exec_list_iterator, iter, this->instructions) { 743 fs_inst *inst = (fs_inst *)iter.get(); 744 745 /* Texturing produces 4 contiguous registers, so no splitting. */ 746 if (inst->is_tex()) { 747 split_grf[inst->dst.reg] = false; 748 } 749 } 750 751 /* Allocate new space for split regs. Note that the virtual 752 * numbers will be contiguous. 
753 */ 754 for (int i = 0; i < num_vars; i++) { 755 if (split_grf[i]) { 756 new_virtual_grf[i] = virtual_grf_alloc(1); 757 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 758 int reg = virtual_grf_alloc(1); 759 assert(reg == new_virtual_grf[i] + j - 1); 760 (void) reg; 761 } 762 this->virtual_grf_sizes[i] = 1; 763 } 764 } 765 766 foreach_iter(exec_list_iterator, iter, this->instructions) { 767 fs_inst *inst = (fs_inst *)iter.get(); 768 769 if (inst->dst.file == GRF && 770 split_grf[inst->dst.reg] && 771 inst->dst.reg_offset != 0) { 772 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 773 inst->dst.reg_offset - 1); 774 inst->dst.reg_offset = 0; 775 } 776 for (int i = 0; i < 3; i++) { 777 if (inst->src[i].file == GRF && 778 split_grf[inst->src[i].reg] && 779 inst->src[i].reg_offset != 0) { 780 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 781 inst->src[i].reg_offset - 1); 782 inst->src[i].reg_offset = 0; 783 } 784 } 785 } 786 this->live_intervals_valid = false; 787} 788 789/** 790 * Choose accesses from the UNIFORM file to demote to using the pull 791 * constant buffer. 792 * 793 * We allow a fragment shader to have more than the specified minimum 794 * maximum number of fragment shader uniform components (64). If 795 * there are too many of these, they'd fill up all of register space. 796 * So, this will push some of them out to the pull constant buffer and 797 * update the program to load them. 798 */ 799void 800fs_visitor::setup_pull_constants() 801{ 802 /* Only allow 16 registers (128 uniform components) as push constants. */ 803 unsigned int max_uniform_components = 16 * 8; 804 if (c->prog_data.nr_params <= max_uniform_components) 805 return; 806 807 if (c->dispatch_width == 16) { 808 fail("Pull constants not supported in 16-wide\n"); 809 return; 810 } 811 812 /* Just demote the end of the list. We could probably do better 813 * here, demoting things that are rarely used in the program first. 
814 */ 815 int pull_uniform_base = max_uniform_components; 816 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 817 818 foreach_iter(exec_list_iterator, iter, this->instructions) { 819 fs_inst *inst = (fs_inst *)iter.get(); 820 821 for (int i = 0; i < 3; i++) { 822 if (inst->src[i].file != UNIFORM) 823 continue; 824 825 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 826 if (uniform_nr < pull_uniform_base) 827 continue; 828 829 fs_reg dst = fs_reg(this, glsl_type::float_type); 830 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 831 dst); 832 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 833 pull->ir = inst->ir; 834 pull->annotation = inst->annotation; 835 pull->base_mrf = 14; 836 pull->mlen = 1; 837 838 inst->insert_before(pull); 839 840 inst->src[i].file = GRF; 841 inst->src[i].reg = dst.reg; 842 inst->src[i].reg_offset = 0; 843 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 844 } 845 } 846 847 for (int i = 0; i < pull_uniform_count; i++) { 848 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 849 c->prog_data.pull_param_convert[i] = 850 c->prog_data.param_convert[pull_uniform_base + i]; 851 } 852 c->prog_data.nr_params -= pull_uniform_count; 853 c->prog_data.nr_pull_params = pull_uniform_count; 854} 855 856void 857fs_visitor::calculate_live_intervals() 858{ 859 int num_vars = this->virtual_grf_next; 860 int *def = ralloc_array(mem_ctx, int, num_vars); 861 int *use = ralloc_array(mem_ctx, int, num_vars); 862 int loop_depth = 0; 863 int loop_start = 0; 864 865 if (this->live_intervals_valid) 866 return; 867 868 for (int i = 0; i < num_vars; i++) { 869 def[i] = MAX_INSTRUCTION; 870 use[i] = -1; 871 } 872 873 int ip = 0; 874 foreach_iter(exec_list_iterator, iter, this->instructions) { 875 fs_inst *inst = (fs_inst *)iter.get(); 876 877 if (inst->opcode == BRW_OPCODE_DO) { 878 if (loop_depth++ == 0) 879 loop_start = ip; 880 } else if (inst->opcode == 
BRW_OPCODE_WHILE) { 881 loop_depth--; 882 883 if (loop_depth == 0) { 884 /* Patches up the use of vars marked for being live across 885 * the whole loop. 886 */ 887 for (int i = 0; i < num_vars; i++) { 888 if (use[i] == loop_start) { 889 use[i] = ip; 890 } 891 } 892 } 893 } else { 894 for (unsigned int i = 0; i < 3; i++) { 895 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 896 int reg = inst->src[i].reg; 897 898 if (!loop_depth) { 899 use[reg] = ip; 900 } else { 901 def[reg] = MIN2(loop_start, def[reg]); 902 use[reg] = loop_start; 903 904 /* Nobody else is going to go smash our start to 905 * later in the loop now, because def[reg] now 906 * points before the bb header. 907 */ 908 } 909 } 910 } 911 if (inst->dst.file == GRF && inst->dst.reg != 0) { 912 int reg = inst->dst.reg; 913 914 if (!loop_depth) { 915 def[reg] = MIN2(def[reg], ip); 916 } else { 917 def[reg] = MIN2(def[reg], loop_start); 918 } 919 } 920 } 921 922 ip++; 923 } 924 925 ralloc_free(this->virtual_grf_def); 926 ralloc_free(this->virtual_grf_use); 927 this->virtual_grf_def = def; 928 this->virtual_grf_use = use; 929 930 this->live_intervals_valid = true; 931} 932 933/** 934 * Attempts to move immediate constants into the immediate 935 * constant slot of following instructions. 
936 * 937 * Immediate constants are a bit tricky -- they have to be in the last 938 * operand slot, you can't do abs/negate on them, 939 */ 940 941bool 942fs_visitor::propagate_constants() 943{ 944 bool progress = false; 945 946 calculate_live_intervals(); 947 948 foreach_iter(exec_list_iterator, iter, this->instructions) { 949 fs_inst *inst = (fs_inst *)iter.get(); 950 951 if (inst->opcode != BRW_OPCODE_MOV || 952 inst->predicated || 953 inst->dst.file != GRF || inst->src[0].file != IMM || 954 inst->dst.type != inst->src[0].type || 955 (c->dispatch_width == 16 && 956 (inst->force_uncompressed || inst->force_sechalf))) 957 continue; 958 959 /* Don't bother with cases where we should have had the 960 * operation on the constant folded in GLSL already. 961 */ 962 if (inst->saturate) 963 continue; 964 965 /* Found a move of a constant to a GRF. Find anything else using the GRF 966 * before it's written, and replace it with the constant if we can. 967 */ 968 exec_list_iterator scan_iter = iter; 969 scan_iter.next(); 970 for (; scan_iter.has_next(); scan_iter.next()) { 971 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 972 973 if (scan_inst->opcode == BRW_OPCODE_DO || 974 scan_inst->opcode == BRW_OPCODE_WHILE || 975 scan_inst->opcode == BRW_OPCODE_ELSE || 976 scan_inst->opcode == BRW_OPCODE_ENDIF) { 977 break; 978 } 979 980 for (int i = 2; i >= 0; i--) { 981 if (scan_inst->src[i].file != GRF || 982 scan_inst->src[i].reg != inst->dst.reg || 983 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 984 continue; 985 986 /* Don't bother with cases where we should have had the 987 * operation on the constant folded in GLSL already. 
988 */ 989 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 990 continue; 991 992 switch (scan_inst->opcode) { 993 case BRW_OPCODE_MOV: 994 scan_inst->src[i] = inst->src[0]; 995 progress = true; 996 break; 997 998 case BRW_OPCODE_MUL: 999 case BRW_OPCODE_ADD: 1000 if (i == 1) { 1001 scan_inst->src[i] = inst->src[0]; 1002 progress = true; 1003 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1004 /* Fit this constant in by commuting the operands */ 1005 scan_inst->src[0] = scan_inst->src[1]; 1006 scan_inst->src[1] = inst->src[0]; 1007 progress = true; 1008 } 1009 break; 1010 1011 case BRW_OPCODE_CMP: 1012 if (i == 1) { 1013 scan_inst->src[i] = inst->src[0]; 1014 progress = true; 1015 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1016 uint32_t new_cmod; 1017 1018 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 1019 if (new_cmod != ~0u) { 1020 /* Fit this constant in by swapping the operands and 1021 * flipping the test 1022 */ 1023 scan_inst->src[0] = scan_inst->src[1]; 1024 scan_inst->src[1] = inst->src[0]; 1025 scan_inst->conditional_mod = new_cmod; 1026 progress = true; 1027 } 1028 } 1029 break; 1030 1031 case BRW_OPCODE_SEL: 1032 if (i == 1) { 1033 scan_inst->src[i] = inst->src[0]; 1034 progress = true; 1035 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1036 scan_inst->src[0] = scan_inst->src[1]; 1037 scan_inst->src[1] = inst->src[0]; 1038 1039 /* If this was predicated, flipping operands means 1040 * we also need to flip the predicate. 
1041 */ 1042 if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) { 1043 scan_inst->predicate_inverse = 1044 !scan_inst->predicate_inverse; 1045 } 1046 progress = true; 1047 } 1048 break; 1049 } 1050 } 1051 1052 if (scan_inst->dst.file == GRF && 1053 scan_inst->dst.reg == inst->dst.reg && 1054 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1055 scan_inst->is_tex())) { 1056 break; 1057 } 1058 } 1059 } 1060 1061 if (progress) 1062 this->live_intervals_valid = false; 1063 1064 return progress; 1065} 1066/** 1067 * Must be called after calculate_live_intervales() to remove unused 1068 * writes to registers -- register allocation will fail otherwise 1069 * because something deffed but not used won't be considered to 1070 * interfere with other regs. 1071 */ 1072bool 1073fs_visitor::dead_code_eliminate() 1074{ 1075 bool progress = false; 1076 int pc = 0; 1077 1078 calculate_live_intervals(); 1079 1080 foreach_iter(exec_list_iterator, iter, this->instructions) { 1081 fs_inst *inst = (fs_inst *)iter.get(); 1082 1083 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 1084 inst->remove(); 1085 progress = true; 1086 } 1087 1088 pc++; 1089 } 1090 1091 if (progress) 1092 live_intervals_valid = false; 1093 1094 return progress; 1095} 1096 1097bool 1098fs_visitor::register_coalesce() 1099{ 1100 bool progress = false; 1101 int if_depth = 0; 1102 int loop_depth = 0; 1103 1104 foreach_iter(exec_list_iterator, iter, this->instructions) { 1105 fs_inst *inst = (fs_inst *)iter.get(); 1106 1107 /* Make sure that we dominate the instructions we're going to 1108 * scan for interfering with our coalescing, or we won't have 1109 * scanned enough to see if anything interferes with our 1110 * coalescing. We don't dominate the following instructions if 1111 * we're in a loop or an if block. 
1112 */ 1113 switch (inst->opcode) { 1114 case BRW_OPCODE_DO: 1115 loop_depth++; 1116 break; 1117 case BRW_OPCODE_WHILE: 1118 loop_depth--; 1119 break; 1120 case BRW_OPCODE_IF: 1121 if_depth++; 1122 break; 1123 case BRW_OPCODE_ENDIF: 1124 if_depth--; 1125 break; 1126 } 1127 if (loop_depth || if_depth) 1128 continue; 1129 1130 if (inst->opcode != BRW_OPCODE_MOV || 1131 inst->predicated || 1132 inst->saturate || 1133 inst->dst.file != GRF || inst->src[0].file != GRF || 1134 inst->dst.type != inst->src[0].type) 1135 continue; 1136 1137 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 1138 1139 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 1140 * them: check for no writes to either one until the exit of the 1141 * program. 1142 */ 1143 bool interfered = false; 1144 exec_list_iterator scan_iter = iter; 1145 scan_iter.next(); 1146 for (; scan_iter.has_next(); scan_iter.next()) { 1147 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 1148 1149 if (scan_inst->dst.file == GRF) { 1150 if (scan_inst->dst.reg == inst->dst.reg && 1151 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1152 scan_inst->is_tex())) { 1153 interfered = true; 1154 break; 1155 } 1156 if (scan_inst->dst.reg == inst->src[0].reg && 1157 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 1158 scan_inst->is_tex())) { 1159 interfered = true; 1160 break; 1161 } 1162 } 1163 1164 /* The gen6 MATH instruction can't handle source modifiers, so avoid 1165 * coalescing those for now. We should do something more specific. 1166 */ 1167 if (intel->gen >= 6 && scan_inst->is_math() && has_source_modifiers) { 1168 interfered = true; 1169 break; 1170 } 1171 } 1172 if (interfered) { 1173 continue; 1174 } 1175 1176 /* Rewrite the later usage to point at the source of the move to 1177 * be removed. 
1178 */ 1179 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 1180 scan_iter.next()) { 1181 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 1182 1183 for (int i = 0; i < 3; i++) { 1184 if (scan_inst->src[i].file == GRF && 1185 scan_inst->src[i].reg == inst->dst.reg && 1186 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 1187 scan_inst->src[i].reg = inst->src[0].reg; 1188 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 1189 scan_inst->src[i].abs |= inst->src[0].abs; 1190 scan_inst->src[i].negate ^= inst->src[0].negate; 1191 scan_inst->src[i].smear = inst->src[0].smear; 1192 } 1193 } 1194 } 1195 1196 inst->remove(); 1197 progress = true; 1198 } 1199 1200 if (progress) 1201 live_intervals_valid = false; 1202 1203 return progress; 1204} 1205 1206 1207bool 1208fs_visitor::compute_to_mrf() 1209{ 1210 bool progress = false; 1211 int next_ip = 0; 1212 1213 calculate_live_intervals(); 1214 1215 foreach_iter(exec_list_iterator, iter, this->instructions) { 1216 fs_inst *inst = (fs_inst *)iter.get(); 1217 1218 int ip = next_ip; 1219 next_ip++; 1220 1221 if (inst->opcode != BRW_OPCODE_MOV || 1222 inst->predicated || 1223 inst->dst.file != MRF || inst->src[0].file != GRF || 1224 inst->dst.type != inst->src[0].type || 1225 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 1226 continue; 1227 1228 /* Work out which hardware MRF registers are written by this 1229 * instruction. 1230 */ 1231 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1232 int mrf_high; 1233 if (inst->dst.hw_reg & BRW_MRF_COMPR4) { 1234 mrf_high = mrf_low + 4; 1235 } else if (c->dispatch_width == 16 && 1236 (!inst->force_uncompressed && !inst->force_sechalf)) { 1237 mrf_high = mrf_low + 1; 1238 } else { 1239 mrf_high = mrf_low; 1240 } 1241 1242 /* Can't compute-to-MRF this GRF if someone else was going to 1243 * read it later. 1244 */ 1245 if (this->virtual_grf_use[inst->src[0].reg] > ip) 1246 continue; 1247 1248 /* Found a move of a GRF to a MRF. 
Let's see if we can go 1249 * rewrite the thing that made this GRF to write into the MRF. 1250 */ 1251 fs_inst *scan_inst; 1252 for (scan_inst = (fs_inst *)inst->prev; 1253 scan_inst->prev != NULL; 1254 scan_inst = (fs_inst *)scan_inst->prev) { 1255 if (scan_inst->dst.file == GRF && 1256 scan_inst->dst.reg == inst->src[0].reg) { 1257 /* Found the last thing to write our reg we want to turn 1258 * into a compute-to-MRF. 1259 */ 1260 1261 if (scan_inst->is_tex()) { 1262 /* texturing writes several continuous regs, so we can't 1263 * compute-to-mrf that. 1264 */ 1265 break; 1266 } 1267 1268 /* If it's predicated, it (probably) didn't populate all 1269 * the channels. We might be able to rewrite everything 1270 * that writes that reg, but it would require smarter 1271 * tracking to delay the rewriting until complete success. 1272 */ 1273 if (scan_inst->predicated) 1274 break; 1275 1276 /* If it's half of register setup and not the same half as 1277 * our MOV we're trying to remove, bail for now. 1278 */ 1279 if (scan_inst->force_uncompressed != inst->force_uncompressed || 1280 scan_inst->force_sechalf != inst->force_sechalf) { 1281 break; 1282 } 1283 1284 /* SEND instructions can't have MRF as a destination. */ 1285 if (scan_inst->mlen) 1286 break; 1287 1288 if (intel->gen >= 6) { 1289 /* gen6 math instructions must have the destination be 1290 * GRF, so no compute-to-MRF for them. 1291 */ 1292 if (scan_inst->is_math()) { 1293 break; 1294 } 1295 } 1296 1297 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1298 /* Found the creator of our MRF's source value. */ 1299 scan_inst->dst.file = MRF; 1300 scan_inst->dst.hw_reg = inst->dst.hw_reg; 1301 scan_inst->saturate |= inst->saturate; 1302 inst->remove(); 1303 progress = true; 1304 } 1305 break; 1306 } 1307 1308 /* We don't handle flow control here. Most computation of 1309 * values that end up in MRFs are shortly before the MRF 1310 * write anyway. 
1311 */ 1312 if (scan_inst->opcode == BRW_OPCODE_DO || 1313 scan_inst->opcode == BRW_OPCODE_WHILE || 1314 scan_inst->opcode == BRW_OPCODE_ELSE || 1315 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1316 break; 1317 } 1318 1319 /* You can't read from an MRF, so if someone else reads our 1320 * MRF's source GRF that we wanted to rewrite, that stops us. 1321 */ 1322 bool interfered = false; 1323 for (int i = 0; i < 3; i++) { 1324 if (scan_inst->src[i].file == GRF && 1325 scan_inst->src[i].reg == inst->src[0].reg && 1326 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 1327 interfered = true; 1328 } 1329 } 1330 if (interfered) 1331 break; 1332 1333 if (scan_inst->dst.file == MRF) { 1334 /* If somebody else writes our MRF here, we can't 1335 * compute-to-MRF before that. 1336 */ 1337 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1338 int scan_mrf_high; 1339 1340 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) { 1341 scan_mrf_high = scan_mrf_low + 4; 1342 } else if (c->dispatch_width == 16 && 1343 (!scan_inst->force_uncompressed && 1344 !scan_inst->force_sechalf)) { 1345 scan_mrf_high = scan_mrf_low + 1; 1346 } else { 1347 scan_mrf_high = scan_mrf_low; 1348 } 1349 1350 if (mrf_low == scan_mrf_low || 1351 mrf_low == scan_mrf_high || 1352 mrf_high == scan_mrf_low || 1353 mrf_high == scan_mrf_high) { 1354 break; 1355 } 1356 } 1357 1358 if (scan_inst->mlen > 0) { 1359 /* Found a SEND instruction, which means that there are 1360 * live values in MRFs from base_mrf to base_mrf + 1361 * scan_inst->mlen - 1. Don't go pushing our MRF write up 1362 * above it. 
1363 */ 1364 if (mrf_low >= scan_inst->base_mrf && 1365 mrf_low < scan_inst->base_mrf + scan_inst->mlen) { 1366 break; 1367 } 1368 if (mrf_high >= scan_inst->base_mrf && 1369 mrf_high < scan_inst->base_mrf + scan_inst->mlen) { 1370 break; 1371 } 1372 } 1373 } 1374 } 1375 1376 return progress; 1377} 1378 1379/** 1380 * Walks through basic blocks, locking for repeated MRF writes and 1381 * removing the later ones. 1382 */ 1383bool 1384fs_visitor::remove_duplicate_mrf_writes() 1385{ 1386 fs_inst *last_mrf_move[16]; 1387 bool progress = false; 1388 1389 /* Need to update the MRF tracking for compressed instructions. */ 1390 if (c->dispatch_width == 16) 1391 return false; 1392 1393 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1394 1395 foreach_iter(exec_list_iterator, iter, this->instructions) { 1396 fs_inst *inst = (fs_inst *)iter.get(); 1397 1398 switch (inst->opcode) { 1399 case BRW_OPCODE_DO: 1400 case BRW_OPCODE_WHILE: 1401 case BRW_OPCODE_IF: 1402 case BRW_OPCODE_ELSE: 1403 case BRW_OPCODE_ENDIF: 1404 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1405 continue; 1406 default: 1407 break; 1408 } 1409 1410 if (inst->opcode == BRW_OPCODE_MOV && 1411 inst->dst.file == MRF) { 1412 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 1413 if (prev_inst && inst->equals(prev_inst)) { 1414 inst->remove(); 1415 progress = true; 1416 continue; 1417 } 1418 } 1419 1420 /* Clear out the last-write records for MRFs that were overwritten. */ 1421 if (inst->dst.file == MRF) { 1422 last_mrf_move[inst->dst.hw_reg] = NULL; 1423 } 1424 1425 if (inst->mlen > 0) { 1426 /* Found a SEND instruction, which will include two or fewer 1427 * implied MRF writes. We could do better here. 1428 */ 1429 for (int i = 0; i < implied_mrf_writes(inst); i++) { 1430 last_mrf_move[inst->base_mrf + i] = NULL; 1431 } 1432 } 1433 1434 /* Clear out any MRF move records whose sources got overwritten. 
*/ 1435 if (inst->dst.file == GRF) { 1436 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 1437 if (last_mrf_move[i] && 1438 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 1439 last_mrf_move[i] = NULL; 1440 } 1441 } 1442 } 1443 1444 if (inst->opcode == BRW_OPCODE_MOV && 1445 inst->dst.file == MRF && 1446 inst->src[0].file == GRF && 1447 !inst->predicated) { 1448 last_mrf_move[inst->dst.hw_reg] = inst; 1449 } 1450 } 1451 1452 return progress; 1453} 1454 1455bool 1456fs_visitor::virtual_grf_interferes(int a, int b) 1457{ 1458 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 1459 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 1460 1461 /* We can't handle dead register writes here, without iterating 1462 * over the whole instruction stream to find every single dead 1463 * write to that register to compare to the live interval of the 1464 * other register. Just assert that dead_code_eliminate() has been 1465 * called. 1466 */ 1467 assert((this->virtual_grf_use[a] != -1 || 1468 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 1469 (this->virtual_grf_use[b] != -1 || 1470 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 1471 1472 /* If the register is used to store 16 values of less than float 1473 * size (only the case for pixel_[xy]), then we can't allocate 1474 * another dword-sized thing to that register that would be used in 1475 * the same instruction. This is because when the GPU decodes (for 1476 * example): 1477 * 1478 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 1479 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 1480 * 1481 * it's actually processed as: 1482 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 1483 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 1484 * 1485 * so our second half values in g6 got overwritten in the first 1486 * half. 
1487 */ 1488 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 1489 this->pixel_x.reg == b || 1490 this->pixel_y.reg == a || 1491 this->pixel_y.reg == b)) { 1492 return start <= end; 1493 } 1494 1495 return start < end; 1496} 1497 1498bool 1499fs_visitor::run() 1500{ 1501 uint32_t prog_offset_16 = 0; 1502 uint32_t orig_nr_params = c->prog_data.nr_params; 1503 1504 brw_wm_payload_setup(brw, c); 1505 1506 if (c->dispatch_width == 16) { 1507 /* align to 64 byte boundary. */ 1508 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { 1509 brw_NOP(p); 1510 } 1511 1512 /* Save off the start of this 16-wide program in case we succeed. */ 1513 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); 1514 1515 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 1516 } 1517 1518 if (0) { 1519 emit_dummy_fs(); 1520 } else { 1521 calculate_urb_setup(); 1522 if (intel->gen < 6) 1523 emit_interpolation_setup_gen4(); 1524 else 1525 emit_interpolation_setup_gen6(); 1526 1527 /* Generate FS IR for main(). (the visitor only descends into 1528 * functions called "main"). 1529 */ 1530 foreach_iter(exec_list_iterator, iter, *shader->ir) { 1531 ir_instruction *ir = (ir_instruction *)iter.get(); 1532 base_ir = ir; 1533 this->result = reg_undef; 1534 ir->accept(this); 1535 } 1536 if (failed) 1537 return false; 1538 1539 emit_fb_writes(); 1540 1541 split_virtual_grfs(); 1542 1543 setup_paramvalues_refs(); 1544 setup_pull_constants(); 1545 1546 bool progress; 1547 do { 1548 progress = false; 1549 1550 progress = remove_duplicate_mrf_writes() || progress; 1551 1552 progress = propagate_constants() || progress; 1553 progress = register_coalesce() || progress; 1554 progress = compute_to_mrf() || progress; 1555 progress = dead_code_eliminate() || progress; 1556 } while (progress); 1557 1558 schedule_instructions(); 1559 1560 assign_curb_setup(); 1561 assign_urb_setup(); 1562 1563 if (0) { 1564 /* Debug of register spilling: Go spill everything. 
*/ 1565 int virtual_grf_count = virtual_grf_next; 1566 for (int i = 1; i < virtual_grf_count; i++) { 1567 spill_reg(i); 1568 } 1569 } 1570 1571 if (0) 1572 assign_regs_trivial(); 1573 else { 1574 while (!assign_regs()) { 1575 if (failed) 1576 break; 1577 } 1578 } 1579 } 1580 assert(force_uncompressed_stack == 0); 1581 assert(force_sechalf_stack == 0); 1582 1583 if (failed) 1584 return false; 1585 1586 generate_code(); 1587 1588 if (c->dispatch_width == 8) { 1589 c->prog_data.reg_blocks = brw_register_blocks(grf_used); 1590 } else { 1591 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); 1592 c->prog_data.prog_offset_16 = prog_offset_16; 1593 1594 /* Make sure we didn't try to sneak in an extra uniform */ 1595 assert(orig_nr_params == c->prog_data.nr_params); 1596 } 1597 1598 return !failed; 1599} 1600 1601bool 1602brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, 1603 struct gl_shader_program *prog) 1604{ 1605 struct intel_context *intel = &brw->intel; 1606 1607 if (!prog) 1608 return false; 1609 1610 struct brw_shader *shader = 1611 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 1612 if (!shader) 1613 return false; 1614 1615 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1616 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 1617 _mesa_print_ir(shader->ir, NULL); 1618 printf("\n\n"); 1619 } 1620 1621 /* Now the main event: Visit the shader IR and generate our FS IR for it. 
1622 */ 1623 c->dispatch_width = 8; 1624 1625 fs_visitor v(c, prog, shader); 1626 if (!v.run()) { 1627 prog->LinkStatus = GL_FALSE; 1628 prog->InfoLog = ralloc_strdup(prog, v.fail_msg); 1629 1630 return false; 1631 } 1632 1633 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { 1634 c->dispatch_width = 16; 1635 fs_visitor v2(c, prog, shader); 1636 v2.import_uniforms(v.variable_ht); 1637 v2.run(); 1638 } 1639 1640 c->prog_data.dispatch_width = 8; 1641 1642 return true; 1643} 1644 1645bool 1646brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) 1647{ 1648 struct brw_context *brw = brw_context(ctx); 1649 struct brw_wm_prog_key key; 1650 struct gl_fragment_program *fp = prog->FragmentProgram; 1651 struct brw_fragment_program *bfp = brw_fragment_program(fp); 1652 1653 if (!fp) 1654 return true; 1655 1656 memset(&key, 0, sizeof(key)); 1657 1658 if (fp->UsesKill) 1659 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; 1660 1661 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 1662 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; 1663 1664 /* Just assume depth testing. 
*/ 1665 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; 1666 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; 1667 1668 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); 1669 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { 1670 int vp_index = -1; 1671 1672 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) 1673 continue; 1674 1675 key.proj_attrib_mask |= 1 << i; 1676 1677 if (i <= FRAG_ATTRIB_TEX7) 1678 vp_index = i; 1679 else if (i >= FRAG_ATTRIB_VAR0) 1680 vp_index = i - FRAG_ATTRIB_VAR0 + VERT_RESULT_VAR0; 1681 1682 if (vp_index >= 0) 1683 key.vp_outputs_written |= BITFIELD64_BIT(vp_index); 1684 } 1685 1686 key.clamp_fragment_color = true; 1687 1688 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) { 1689 if (fp->Base.ShadowSamplers & (1 << i)) 1690 key.compare_funcs[i] = GL_LESS; 1691 1692 /* FINISHME: depth compares might use (0,0,0,W) for example */ 1693 key.tex_swizzles[i] = SWIZZLE_XYZW; 1694 } 1695 1696 if (fp->Base.InputsRead & FRAG_BIT_WPOS) { 1697 key.drawable_height = ctx->DrawBuffer->Height; 1698 key.render_to_fbo = ctx->DrawBuffer->Name != 0; 1699 } 1700 1701 key.nr_color_regions = 1; 1702 1703 key.program_string_id = bfp->id; 1704 1705 uint32_t old_prog_offset = brw->wm.prog_offset; 1706 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; 1707 1708 bool success = do_wm_prog(brw, prog, bfp, &key); 1709 1710 brw->wm.prog_offset = old_prog_offset; 1711 brw->wm.prog_data = old_prog_data; 1712 1713 return success; 1714} 1715