brw_fs.cpp revision 8752764076e5b3f052a57e0134424a37bf2e9164
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
29 */ 30 31extern "C" { 32 33#include <sys/types.h> 34 35#include "main/macros.h" 36#include "main/shaderobj.h" 37#include "main/uniforms.h" 38#include "program/prog_parameter.h" 39#include "program/prog_print.h" 40#include "program/register_allocate.h" 41#include "program/sampler.h" 42#include "program/hash_table.h" 43#include "brw_context.h" 44#include "brw_eu.h" 45#include "brw_wm.h" 46} 47#include "brw_shader.h" 48#include "brw_fs.h" 49#include "../glsl/glsl_types.h" 50#include "../glsl/ir_print_visitor.h" 51 52#define MAX_INSTRUCTION (1 << 30) 53 54int 55fs_visitor::type_size(const struct glsl_type *type) 56{ 57 unsigned int size, i; 58 59 switch (type->base_type) { 60 case GLSL_TYPE_UINT: 61 case GLSL_TYPE_INT: 62 case GLSL_TYPE_FLOAT: 63 case GLSL_TYPE_BOOL: 64 return type->components(); 65 case GLSL_TYPE_ARRAY: 66 return type_size(type->fields.array) * type->length; 67 case GLSL_TYPE_STRUCT: 68 size = 0; 69 for (i = 0; i < type->length; i++) { 70 size += type_size(type->fields.structure[i].type); 71 } 72 return size; 73 case GLSL_TYPE_SAMPLER: 74 /* Samplers take up no register space, since they're baked in at 75 * link time. 76 */ 77 return 0; 78 default: 79 assert(!"not reached"); 80 return 0; 81 } 82} 83 84void 85fs_visitor::fail(const char *format, ...) 
86{ 87 va_list va; 88 char *msg; 89 90 if (failed) 91 return; 92 93 failed = true; 94 95 va_start(va, format); 96 msg = ralloc_vasprintf(mem_ctx, format, va); 97 va_end(va); 98 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); 99 100 this->fail_msg = msg; 101 102 if (INTEL_DEBUG & DEBUG_WM) { 103 fprintf(stderr, msg); 104 } 105} 106 107void 108fs_visitor::push_force_uncompressed() 109{ 110 force_uncompressed_stack++; 111} 112 113void 114fs_visitor::pop_force_uncompressed() 115{ 116 force_uncompressed_stack--; 117 assert(force_uncompressed_stack >= 0); 118} 119 120void 121fs_visitor::push_force_sechalf() 122{ 123 force_sechalf_stack++; 124} 125 126void 127fs_visitor::pop_force_sechalf() 128{ 129 force_sechalf_stack--; 130 assert(force_sechalf_stack >= 0); 131} 132 133/** 134 * Returns how many MRFs an FS opcode will write over. 135 * 136 * Note that this is not the 0 or 1 implied writes in an actual gen 137 * instruction -- the FS opcodes often generate MOVs in addition. 138 */ 139int 140fs_visitor::implied_mrf_writes(fs_inst *inst) 141{ 142 if (inst->mlen == 0) 143 return 0; 144 145 switch (inst->opcode) { 146 case FS_OPCODE_RCP: 147 case FS_OPCODE_RSQ: 148 case FS_OPCODE_SQRT: 149 case FS_OPCODE_EXP2: 150 case FS_OPCODE_LOG2: 151 case FS_OPCODE_SIN: 152 case FS_OPCODE_COS: 153 return 1 * c->dispatch_width / 8; 154 case FS_OPCODE_POW: 155 return 2 * c->dispatch_width / 8; 156 case FS_OPCODE_TEX: 157 case FS_OPCODE_TXB: 158 case FS_OPCODE_TXD: 159 case FS_OPCODE_TXL: 160 return 1; 161 case FS_OPCODE_FB_WRITE: 162 return 2; 163 case FS_OPCODE_PULL_CONSTANT_LOAD: 164 case FS_OPCODE_UNSPILL: 165 return 1; 166 case FS_OPCODE_SPILL: 167 return 2; 168 default: 169 assert(!"not reached"); 170 return inst->mlen; 171 } 172} 173 174int 175fs_visitor::virtual_grf_alloc(int size) 176{ 177 if (virtual_grf_array_size <= virtual_grf_next) { 178 if (virtual_grf_array_size == 0) 179 virtual_grf_array_size = 16; 180 else 181 virtual_grf_array_size *= 2; 182 
virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 183 virtual_grf_array_size); 184 185 /* This slot is always unused. */ 186 virtual_grf_sizes[0] = 0; 187 } 188 virtual_grf_sizes[virtual_grf_next] = size; 189 return virtual_grf_next++; 190} 191 192/** Fixed HW reg constructor. */ 193fs_reg::fs_reg(enum register_file file, int hw_reg) 194{ 195 init(); 196 this->file = file; 197 this->hw_reg = hw_reg; 198 this->type = BRW_REGISTER_TYPE_F; 199} 200 201/** Fixed HW reg constructor. */ 202fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 203{ 204 init(); 205 this->file = file; 206 this->hw_reg = hw_reg; 207 this->type = type; 208} 209 210/** Automatic reg constructor. */ 211fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 212{ 213 init(); 214 215 this->file = GRF; 216 this->reg = v->virtual_grf_alloc(v->type_size(type)); 217 this->reg_offset = 0; 218 this->type = brw_type_for_base_type(type); 219} 220 221fs_reg * 222fs_visitor::variable_storage(ir_variable *var) 223{ 224 return (fs_reg *)hash_table_find(this->variable_ht, var); 225} 226 227void 228import_uniforms_callback(const void *key, 229 void *data, 230 void *closure) 231{ 232 struct hash_table *dst_ht = (struct hash_table *)closure; 233 const fs_reg *reg = (const fs_reg *)data; 234 235 if (reg->file != UNIFORM) 236 return; 237 238 hash_table_insert(dst_ht, data, key); 239} 240 241/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 242 * This brings in those uniform definitions 243 */ 244void 245fs_visitor::import_uniforms(struct hash_table *src_variable_ht) 246{ 247 hash_table_call_foreach(src_variable_ht, 248 import_uniforms_callback, 249 variable_ht); 250} 251 252/* Our support for uniforms is piggy-backed on the struct 253 * gl_fragment_program, because that's where the values actually 254 * get stored, rather than in some global gl_shader_program uniform 255 * store. 
256 */ 257int 258fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 259{ 260 unsigned int offset = 0; 261 262 if (type->is_matrix()) { 263 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 264 type->vector_elements, 265 1); 266 267 for (unsigned int i = 0; i < type->matrix_columns; i++) { 268 offset += setup_uniform_values(loc + offset, column); 269 } 270 271 return offset; 272 } 273 274 switch (type->base_type) { 275 case GLSL_TYPE_FLOAT: 276 case GLSL_TYPE_UINT: 277 case GLSL_TYPE_INT: 278 case GLSL_TYPE_BOOL: 279 for (unsigned int i = 0; i < type->vector_elements; i++) { 280 unsigned int param = c->prog_data.nr_params++; 281 282 assert(param < ARRAY_SIZE(c->prog_data.param)); 283 284 switch (type->base_type) { 285 case GLSL_TYPE_FLOAT: 286 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 287 break; 288 case GLSL_TYPE_UINT: 289 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 290 break; 291 case GLSL_TYPE_INT: 292 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 293 break; 294 case GLSL_TYPE_BOOL: 295 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 296 break; 297 default: 298 assert(!"not reached"); 299 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 300 break; 301 } 302 this->param_index[param] = loc; 303 this->param_offset[param] = i; 304 } 305 return 1; 306 307 case GLSL_TYPE_STRUCT: 308 for (unsigned int i = 0; i < type->length; i++) { 309 offset += setup_uniform_values(loc + offset, 310 type->fields.structure[i].type); 311 } 312 return offset; 313 314 case GLSL_TYPE_ARRAY: 315 for (unsigned int i = 0; i < type->length; i++) { 316 offset += setup_uniform_values(loc + offset, type->fields.array); 317 } 318 return offset; 319 320 case GLSL_TYPE_SAMPLER: 321 /* The sampler takes up a slot, but we don't use any values from it. */ 322 return 1; 323 324 default: 325 assert(!"not reached"); 326 return 0; 327 } 328} 329 330 331/* Our support for builtin uniforms is even scarier than non-builtin. 
332 * It sits on top of the PROG_STATE_VAR parameters that are 333 * automatically updated from GL context state. 334 */ 335void 336fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 337{ 338 const ir_state_slot *const slots = ir->state_slots; 339 assert(ir->state_slots != NULL); 340 341 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 342 /* This state reference has already been setup by ir_to_mesa, but we'll 343 * get the same index back here. 344 */ 345 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 346 (gl_state_index *)slots[i].tokens); 347 348 /* Add each of the unique swizzles of the element as a parameter. 349 * This'll end up matching the expected layout of the 350 * array/matrix/structure we're trying to fill in. 351 */ 352 int last_swiz = -1; 353 for (unsigned int j = 0; j < 4; j++) { 354 int swiz = GET_SWZ(slots[i].swizzle, j); 355 if (swiz == last_swiz) 356 break; 357 last_swiz = swiz; 358 359 c->prog_data.param_convert[c->prog_data.nr_params] = 360 PARAM_NO_CONVERT; 361 this->param_index[c->prog_data.nr_params] = index; 362 this->param_offset[c->prog_data.nr_params] = swiz; 363 c->prog_data.nr_params++; 364 } 365 } 366} 367 368fs_reg * 369fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 370{ 371 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 372 fs_reg wpos = *reg; 373 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 374 375 /* gl_FragCoord.x */ 376 if (ir->pixel_center_integer) { 377 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 378 } else { 379 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 380 } 381 wpos.reg_offset++; 382 383 /* gl_FragCoord.y */ 384 if (!flip && ir->pixel_center_integer) { 385 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 386 } else { 387 fs_reg pixel_y = this->pixel_y; 388 float offset = (ir->pixel_center_integer ? 
0.0 : 0.5); 389 390 if (flip) { 391 pixel_y.negate = true; 392 offset += c->key.drawable_height - 1.0; 393 } 394 395 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 396 } 397 wpos.reg_offset++; 398 399 /* gl_FragCoord.z */ 400 if (intel->gen >= 6) { 401 emit(BRW_OPCODE_MOV, wpos, 402 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 403 } else { 404 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 405 interp_reg(FRAG_ATTRIB_WPOS, 2)); 406 } 407 wpos.reg_offset++; 408 409 /* gl_FragCoord.w: Already set up in emit_interpolation */ 410 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 411 412 return reg; 413} 414 415fs_reg * 416fs_visitor::emit_general_interpolation(ir_variable *ir) 417{ 418 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 419 /* Interpolation is always in floating point regs. */ 420 reg->type = BRW_REGISTER_TYPE_F; 421 fs_reg attr = *reg; 422 423 unsigned int array_elements; 424 const glsl_type *type; 425 426 if (ir->type->is_array()) { 427 array_elements = ir->type->length; 428 if (array_elements == 0) { 429 fail("dereferenced array '%s' has length 0\n", ir->name); 430 } 431 type = ir->type->fields.array; 432 } else { 433 array_elements = 1; 434 type = ir->type; 435 } 436 437 int location = ir->location; 438 for (unsigned int i = 0; i < array_elements; i++) { 439 for (unsigned int j = 0; j < type->matrix_columns; j++) { 440 if (urb_setup[location] == -1) { 441 /* If there's no incoming setup data for this slot, don't 442 * emit interpolation for it. 443 */ 444 attr.reg_offset += type->vector_elements; 445 location++; 446 continue; 447 } 448 449 bool is_gl_Color = 450 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 451 452 if (c->key.flat_shade && is_gl_Color) { 453 /* Constant interpolation (flat shading) case. The SF has 454 * handed us defined values in only the constant offset 455 * field of the setup reg. 
456 */ 457 for (unsigned int k = 0; k < type->vector_elements; k++) { 458 struct brw_reg interp = interp_reg(location, k); 459 interp = suboffset(interp, 3); 460 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 461 attr.reg_offset++; 462 } 463 } else { 464 /* Perspective interpolation case. */ 465 for (unsigned int k = 0; k < type->vector_elements; k++) { 466 struct brw_reg interp = interp_reg(location, k); 467 emit(FS_OPCODE_LINTERP, attr, 468 this->delta_x, this->delta_y, fs_reg(interp)); 469 attr.reg_offset++; 470 } 471 472 if (intel->gen < 6) { 473 attr.reg_offset -= type->vector_elements; 474 for (unsigned int k = 0; k < type->vector_elements; k++) { 475 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 476 attr.reg_offset++; 477 } 478 } 479 } 480 location++; 481 } 482 } 483 484 return reg; 485} 486 487fs_reg * 488fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 489{ 490 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 491 492 /* The frontfacing comes in as a bit in the thread payload. 
*/ 493 if (intel->gen >= 6) { 494 emit(BRW_OPCODE_ASR, *reg, 495 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 496 fs_reg(15)); 497 emit(BRW_OPCODE_NOT, *reg, *reg); 498 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 499 } else { 500 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 501 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 502 * us front face 503 */ 504 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 505 fs_reg(r1_6ud), 506 fs_reg(1u << 31)); 507 inst->conditional_mod = BRW_CONDITIONAL_L; 508 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 509 } 510 511 return reg; 512} 513 514fs_inst * 515fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 516{ 517 switch (opcode) { 518 case FS_OPCODE_RCP: 519 case FS_OPCODE_RSQ: 520 case FS_OPCODE_SQRT: 521 case FS_OPCODE_EXP2: 522 case FS_OPCODE_LOG2: 523 case FS_OPCODE_SIN: 524 case FS_OPCODE_COS: 525 break; 526 default: 527 assert(!"not reached: bad math opcode"); 528 return NULL; 529 } 530 531 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 532 * might be able to do better by doing execsize = 1 math and then 533 * expanding that result out, but we would need to be careful with 534 * masking. 535 * 536 * The hardware ignores source modifiers (negate and abs) on math 537 * instructions, so we also move to a temp to set those up. 
538 */ 539 if (intel->gen >= 6 && (src.file == UNIFORM || 540 src.abs || 541 src.negate)) { 542 fs_reg expanded = fs_reg(this, glsl_type::float_type); 543 emit(BRW_OPCODE_MOV, expanded, src); 544 src = expanded; 545 } 546 547 fs_inst *inst = emit(opcode, dst, src); 548 549 if (intel->gen < 6) { 550 inst->base_mrf = 2; 551 inst->mlen = c->dispatch_width / 8; 552 } 553 554 return inst; 555} 556 557fs_inst * 558fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 559{ 560 int base_mrf = 2; 561 fs_inst *inst; 562 563 assert(opcode == FS_OPCODE_POW); 564 565 if (intel->gen >= 6) { 566 /* Can't do hstride == 0 args to gen6 math, so expand it out. 567 * 568 * The hardware ignores source modifiers (negate and abs) on math 569 * instructions, so we also move to a temp to set those up. 570 */ 571 if (src0.file == UNIFORM || src0.abs || src0.negate) { 572 fs_reg expanded = fs_reg(this, glsl_type::float_type); 573 emit(BRW_OPCODE_MOV, expanded, src0); 574 src0 = expanded; 575 } 576 577 if (src1.file == UNIFORM || src1.abs || src1.negate) { 578 fs_reg expanded = fs_reg(this, glsl_type::float_type); 579 emit(BRW_OPCODE_MOV, expanded, src1); 580 src1 = expanded; 581 } 582 583 inst = emit(opcode, dst, src0, src1); 584 } else { 585 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 586 inst = emit(opcode, dst, src0, reg_null_f); 587 588 inst->base_mrf = base_mrf; 589 inst->mlen = 2 * c->dispatch_width / 8; 590 } 591 return inst; 592} 593 594/** 595 * To be called after the last _mesa_add_state_reference() call, to 596 * set up prog_data.param[] for assign_curb_setup() and 597 * setup_pull_constants(). 598 */ 599void 600fs_visitor::setup_paramvalues_refs() 601{ 602 if (c->dispatch_width != 8) 603 return; 604 605 /* Set up the pointers to ParamValues now that that array is finalized. 
*/ 606 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 607 c->prog_data.param[i] = 608 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 609 this->param_offset[i]; 610 } 611} 612 613void 614fs_visitor::assign_curb_setup() 615{ 616 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 617 if (c->dispatch_width == 8) { 618 c->prog_data.first_curbe_grf = c->nr_payload_regs; 619 } else { 620 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 621 } 622 623 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 624 foreach_iter(exec_list_iterator, iter, this->instructions) { 625 fs_inst *inst = (fs_inst *)iter.get(); 626 627 for (unsigned int i = 0; i < 3; i++) { 628 if (inst->src[i].file == UNIFORM) { 629 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 630 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 631 constant_nr / 8, 632 constant_nr % 8); 633 634 inst->src[i].file = FIXED_HW_REG; 635 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 636 } 637 } 638 } 639} 640 641void 642fs_visitor::calculate_urb_setup() 643{ 644 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 645 urb_setup[i] = -1; 646 } 647 648 int urb_next = 0; 649 /* Figure out where each of the incoming setup attributes lands. */ 650 if (intel->gen >= 6) { 651 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 652 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { 653 urb_setup[i] = urb_next++; 654 } 655 } 656 } else { 657 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ 658 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 659 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 660 int fp_index; 661 662 if (i >= VERT_RESULT_VAR0) 663 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 664 else if (i <= VERT_RESULT_TEX7) 665 fp_index = i; 666 else 667 fp_index = -1; 668 669 if (fp_index >= 0) 670 urb_setup[fp_index] = urb_next++; 671 } 672 } 673 } 674 675 /* Each attribute is 4 setup channels, each of which is half a reg. */ 676 c->prog_data.urb_read_length = urb_next * 2; 677} 678 679void 680fs_visitor::assign_urb_setup() 681{ 682 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 683 684 /* Offset all the urb_setup[] index by the actual position of the 685 * setup regs, now that the location of the constants has been chosen. 686 */ 687 foreach_iter(exec_list_iterator, iter, this->instructions) { 688 fs_inst *inst = (fs_inst *)iter.get(); 689 690 if (inst->opcode == FS_OPCODE_LINTERP) { 691 assert(inst->src[2].file == FIXED_HW_REG); 692 inst->src[2].fixed_hw_reg.nr += urb_start; 693 } 694 695 if (inst->opcode == FS_OPCODE_CINTERP) { 696 assert(inst->src[0].file == FIXED_HW_REG); 697 inst->src[0].fixed_hw_reg.nr += urb_start; 698 } 699 } 700 701 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 702} 703 704/** 705 * Split large virtual GRFs into separate components if we can. 706 * 707 * This is mostly duplicated with what brw_fs_vector_splitting does, 708 * but that's really conservative because it's afraid of doing 709 * splitting that doesn't result in real progress after the rest of 710 * the optimization phases, which would cause infinite looping in 711 * optimization. We can do it once here, safely. This also has the 712 * opportunity to split interpolated values, or maybe even uniforms, 713 * which we don't have at the IR level. 
714 * 715 * We want to split, because virtual GRFs are what we register 716 * allocate and spill (due to contiguousness requirements for some 717 * instructions), and they're what we naturally generate in the 718 * codegen process, but most virtual GRFs don't actually need to be 719 * contiguous sets of GRFs. If we split, we'll end up with reduced 720 * live intervals and better dead code elimination and coalescing. 721 */ 722void 723fs_visitor::split_virtual_grfs() 724{ 725 int num_vars = this->virtual_grf_next; 726 bool split_grf[num_vars]; 727 int new_virtual_grf[num_vars]; 728 729 /* Try to split anything > 0 sized. */ 730 for (int i = 0; i < num_vars; i++) { 731 if (this->virtual_grf_sizes[i] != 1) 732 split_grf[i] = true; 733 else 734 split_grf[i] = false; 735 } 736 737 if (brw->has_pln) { 738 /* PLN opcodes rely on the delta_xy being contiguous. */ 739 split_grf[this->delta_x.reg] = false; 740 } 741 742 foreach_iter(exec_list_iterator, iter, this->instructions) { 743 fs_inst *inst = (fs_inst *)iter.get(); 744 745 /* Texturing produces 4 contiguous registers, so no splitting. */ 746 if (inst->is_tex()) { 747 split_grf[inst->dst.reg] = false; 748 } 749 } 750 751 /* Allocate new space for split regs. Note that the virtual 752 * numbers will be contiguous. 
753 */ 754 for (int i = 0; i < num_vars; i++) { 755 if (split_grf[i]) { 756 new_virtual_grf[i] = virtual_grf_alloc(1); 757 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 758 int reg = virtual_grf_alloc(1); 759 assert(reg == new_virtual_grf[i] + j - 1); 760 (void) reg; 761 } 762 this->virtual_grf_sizes[i] = 1; 763 } 764 } 765 766 foreach_iter(exec_list_iterator, iter, this->instructions) { 767 fs_inst *inst = (fs_inst *)iter.get(); 768 769 if (inst->dst.file == GRF && 770 split_grf[inst->dst.reg] && 771 inst->dst.reg_offset != 0) { 772 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 773 inst->dst.reg_offset - 1); 774 inst->dst.reg_offset = 0; 775 } 776 for (int i = 0; i < 3; i++) { 777 if (inst->src[i].file == GRF && 778 split_grf[inst->src[i].reg] && 779 inst->src[i].reg_offset != 0) { 780 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 781 inst->src[i].reg_offset - 1); 782 inst->src[i].reg_offset = 0; 783 } 784 } 785 } 786 this->live_intervals_valid = false; 787} 788 789/** 790 * Choose accesses from the UNIFORM file to demote to using the pull 791 * constant buffer. 792 * 793 * We allow a fragment shader to have more than the specified minimum 794 * maximum number of fragment shader uniform components (64). If 795 * there are too many of these, they'd fill up all of register space. 796 * So, this will push some of them out to the pull constant buffer and 797 * update the program to load them. 798 */ 799void 800fs_visitor::setup_pull_constants() 801{ 802 /* Only allow 16 registers (128 uniform components) as push constants. */ 803 unsigned int max_uniform_components = 16 * 8; 804 if (c->prog_data.nr_params <= max_uniform_components) 805 return; 806 807 if (c->dispatch_width == 16) { 808 fail("Pull constants not supported in 16-wide\n"); 809 return; 810 } 811 812 /* Just demote the end of the list. We could probably do better 813 * here, demoting things that are rarely used in the program first. 
814 */ 815 int pull_uniform_base = max_uniform_components; 816 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 817 818 foreach_iter(exec_list_iterator, iter, this->instructions) { 819 fs_inst *inst = (fs_inst *)iter.get(); 820 821 for (int i = 0; i < 3; i++) { 822 if (inst->src[i].file != UNIFORM) 823 continue; 824 825 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 826 if (uniform_nr < pull_uniform_base) 827 continue; 828 829 fs_reg dst = fs_reg(this, glsl_type::float_type); 830 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 831 dst); 832 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 833 pull->ir = inst->ir; 834 pull->annotation = inst->annotation; 835 pull->base_mrf = 14; 836 pull->mlen = 1; 837 838 inst->insert_before(pull); 839 840 inst->src[i].file = GRF; 841 inst->src[i].reg = dst.reg; 842 inst->src[i].reg_offset = 0; 843 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 844 } 845 } 846 847 for (int i = 0; i < pull_uniform_count; i++) { 848 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 849 c->prog_data.pull_param_convert[i] = 850 c->prog_data.param_convert[pull_uniform_base + i]; 851 } 852 c->prog_data.nr_params -= pull_uniform_count; 853 c->prog_data.nr_pull_params = pull_uniform_count; 854} 855 856void 857fs_visitor::calculate_live_intervals() 858{ 859 int num_vars = this->virtual_grf_next; 860 int *def = ralloc_array(mem_ctx, int, num_vars); 861 int *use = ralloc_array(mem_ctx, int, num_vars); 862 int loop_depth = 0; 863 int loop_start = 0; 864 865 if (this->live_intervals_valid) 866 return; 867 868 for (int i = 0; i < num_vars; i++) { 869 def[i] = MAX_INSTRUCTION; 870 use[i] = -1; 871 } 872 873 int ip = 0; 874 foreach_iter(exec_list_iterator, iter, this->instructions) { 875 fs_inst *inst = (fs_inst *)iter.get(); 876 877 if (inst->opcode == BRW_OPCODE_DO) { 878 if (loop_depth++ == 0) 879 loop_start = ip; 880 } else if (inst->opcode == 
BRW_OPCODE_WHILE) { 881 loop_depth--; 882 883 if (loop_depth == 0) { 884 /* Patches up the use of vars marked for being live across 885 * the whole loop. 886 */ 887 for (int i = 0; i < num_vars; i++) { 888 if (use[i] == loop_start) { 889 use[i] = ip; 890 } 891 } 892 } 893 } else { 894 for (unsigned int i = 0; i < 3; i++) { 895 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 896 int reg = inst->src[i].reg; 897 898 if (!loop_depth) { 899 use[reg] = ip; 900 } else { 901 def[reg] = MIN2(loop_start, def[reg]); 902 use[reg] = loop_start; 903 904 /* Nobody else is going to go smash our start to 905 * later in the loop now, because def[reg] now 906 * points before the bb header. 907 */ 908 } 909 } 910 } 911 if (inst->dst.file == GRF && inst->dst.reg != 0) { 912 int reg = inst->dst.reg; 913 914 if (!loop_depth) { 915 def[reg] = MIN2(def[reg], ip); 916 } else { 917 def[reg] = MIN2(def[reg], loop_start); 918 } 919 } 920 } 921 922 ip++; 923 } 924 925 ralloc_free(this->virtual_grf_def); 926 ralloc_free(this->virtual_grf_use); 927 this->virtual_grf_def = def; 928 this->virtual_grf_use = use; 929 930 this->live_intervals_valid = true; 931} 932 933/** 934 * Attempts to move immediate constants into the immediate 935 * constant slot of following instructions. 
936 * 937 * Immediate constants are a bit tricky -- they have to be in the last 938 * operand slot, you can't do abs/negate on them, 939 */ 940 941bool 942fs_visitor::propagate_constants() 943{ 944 bool progress = false; 945 946 calculate_live_intervals(); 947 948 foreach_iter(exec_list_iterator, iter, this->instructions) { 949 fs_inst *inst = (fs_inst *)iter.get(); 950 951 if (inst->opcode != BRW_OPCODE_MOV || 952 inst->predicated || 953 inst->dst.file != GRF || inst->src[0].file != IMM || 954 inst->dst.type != inst->src[0].type || 955 (c->dispatch_width == 16 && 956 (inst->force_uncompressed || inst->force_sechalf))) 957 continue; 958 959 /* Don't bother with cases where we should have had the 960 * operation on the constant folded in GLSL already. 961 */ 962 if (inst->saturate) 963 continue; 964 965 /* Found a move of a constant to a GRF. Find anything else using the GRF 966 * before it's written, and replace it with the constant if we can. 967 */ 968 exec_list_iterator scan_iter = iter; 969 scan_iter.next(); 970 for (; scan_iter.has_next(); scan_iter.next()) { 971 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 972 973 if (scan_inst->opcode == BRW_OPCODE_DO || 974 scan_inst->opcode == BRW_OPCODE_WHILE || 975 scan_inst->opcode == BRW_OPCODE_ELSE || 976 scan_inst->opcode == BRW_OPCODE_ENDIF) { 977 break; 978 } 979 980 for (int i = 2; i >= 0; i--) { 981 if (scan_inst->src[i].file != GRF || 982 scan_inst->src[i].reg != inst->dst.reg || 983 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 984 continue; 985 986 /* Don't bother with cases where we should have had the 987 * operation on the constant folded in GLSL already. 
988 */ 989 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 990 continue; 991 992 switch (scan_inst->opcode) { 993 case BRW_OPCODE_MOV: 994 scan_inst->src[i] = inst->src[0]; 995 progress = true; 996 break; 997 998 case BRW_OPCODE_MUL: 999 case BRW_OPCODE_ADD: 1000 if (i == 1) { 1001 scan_inst->src[i] = inst->src[0]; 1002 progress = true; 1003 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1004 /* Fit this constant in by commuting the operands */ 1005 scan_inst->src[0] = scan_inst->src[1]; 1006 scan_inst->src[1] = inst->src[0]; 1007 progress = true; 1008 } 1009 break; 1010 1011 case BRW_OPCODE_CMP: 1012 if (i == 1) { 1013 scan_inst->src[i] = inst->src[0]; 1014 progress = true; 1015 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1016 uint32_t new_cmod; 1017 1018 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 1019 if (new_cmod != ~0u) { 1020 /* Fit this constant in by swapping the operands and 1021 * flipping the test 1022 */ 1023 scan_inst->src[0] = scan_inst->src[1]; 1024 scan_inst->src[1] = inst->src[0]; 1025 scan_inst->conditional_mod = new_cmod; 1026 progress = true; 1027 } 1028 } 1029 break; 1030 1031 case BRW_OPCODE_SEL: 1032 if (i == 1) { 1033 scan_inst->src[i] = inst->src[0]; 1034 progress = true; 1035 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1036 /* Fit this constant in by swapping the operands and 1037 * flipping the predicate 1038 */ 1039 scan_inst->src[0] = scan_inst->src[1]; 1040 scan_inst->src[1] = inst->src[0]; 1041 scan_inst->predicate_inverse = !scan_inst->predicate_inverse; 1042 progress = true; 1043 } 1044 break; 1045 } 1046 } 1047 1048 if (scan_inst->dst.file == GRF && 1049 scan_inst->dst.reg == inst->dst.reg && 1050 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1051 scan_inst->is_tex())) { 1052 break; 1053 } 1054 } 1055 } 1056 1057 if (progress) 1058 this->live_intervals_valid = false; 1059 1060 return progress; 1061} 1062/** 1063 * Must be called after calculate_live_intervales() to remove unused 
1064 * writes to registers -- register allocation will fail otherwise 1065 * because something deffed but not used won't be considered to 1066 * interfere with other regs. 1067 */ 1068bool 1069fs_visitor::dead_code_eliminate() 1070{ 1071 bool progress = false; 1072 int pc = 0; 1073 1074 calculate_live_intervals(); 1075 1076 foreach_iter(exec_list_iterator, iter, this->instructions) { 1077 fs_inst *inst = (fs_inst *)iter.get(); 1078 1079 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 1080 inst->remove(); 1081 progress = true; 1082 } 1083 1084 pc++; 1085 } 1086 1087 if (progress) 1088 live_intervals_valid = false; 1089 1090 return progress; 1091} 1092 1093bool 1094fs_visitor::register_coalesce() 1095{ 1096 bool progress = false; 1097 int if_depth = 0; 1098 int loop_depth = 0; 1099 1100 foreach_iter(exec_list_iterator, iter, this->instructions) { 1101 fs_inst *inst = (fs_inst *)iter.get(); 1102 1103 /* Make sure that we dominate the instructions we're going to 1104 * scan for interfering with our coalescing, or we won't have 1105 * scanned enough to see if anything interferes with our 1106 * coalescing. We don't dominate the following instructions if 1107 * we're in a loop or an if block. 1108 */ 1109 switch (inst->opcode) { 1110 case BRW_OPCODE_DO: 1111 loop_depth++; 1112 break; 1113 case BRW_OPCODE_WHILE: 1114 loop_depth--; 1115 break; 1116 case BRW_OPCODE_IF: 1117 if_depth++; 1118 break; 1119 case BRW_OPCODE_ENDIF: 1120 if_depth--; 1121 break; 1122 } 1123 if (loop_depth || if_depth) 1124 continue; 1125 1126 if (inst->opcode != BRW_OPCODE_MOV || 1127 inst->predicated || 1128 inst->saturate || 1129 inst->dst.file != GRF || inst->src[0].file != GRF || 1130 inst->dst.type != inst->src[0].type) 1131 continue; 1132 1133 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 1134 1135 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 1136 * them: check for no writes to either one until the exit of the 1137 * program. 
1138 */ 1139 bool interfered = false; 1140 exec_list_iterator scan_iter = iter; 1141 scan_iter.next(); 1142 for (; scan_iter.has_next(); scan_iter.next()) { 1143 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 1144 1145 if (scan_inst->dst.file == GRF) { 1146 if (scan_inst->dst.reg == inst->dst.reg && 1147 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1148 scan_inst->is_tex())) { 1149 interfered = true; 1150 break; 1151 } 1152 if (scan_inst->dst.reg == inst->src[0].reg && 1153 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 1154 scan_inst->is_tex())) { 1155 interfered = true; 1156 break; 1157 } 1158 } 1159 1160 /* The gen6 MATH instruction can't handle source modifiers, so avoid 1161 * coalescing those for now. We should do something more specific. 1162 */ 1163 if (intel->gen >= 6 && scan_inst->is_math() && has_source_modifiers) { 1164 interfered = true; 1165 break; 1166 } 1167 } 1168 if (interfered) { 1169 continue; 1170 } 1171 1172 /* Rewrite the later usage to point at the source of the move to 1173 * be removed. 
1174 */ 1175 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 1176 scan_iter.next()) { 1177 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 1178 1179 for (int i = 0; i < 3; i++) { 1180 if (scan_inst->src[i].file == GRF && 1181 scan_inst->src[i].reg == inst->dst.reg && 1182 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 1183 scan_inst->src[i].reg = inst->src[0].reg; 1184 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 1185 scan_inst->src[i].abs |= inst->src[0].abs; 1186 scan_inst->src[i].negate ^= inst->src[0].negate; 1187 scan_inst->src[i].smear = inst->src[0].smear; 1188 } 1189 } 1190 } 1191 1192 inst->remove(); 1193 progress = true; 1194 } 1195 1196 if (progress) 1197 live_intervals_valid = false; 1198 1199 return progress; 1200} 1201 1202 1203bool 1204fs_visitor::compute_to_mrf() 1205{ 1206 bool progress = false; 1207 int next_ip = 0; 1208 1209 calculate_live_intervals(); 1210 1211 foreach_iter(exec_list_iterator, iter, this->instructions) { 1212 fs_inst *inst = (fs_inst *)iter.get(); 1213 1214 int ip = next_ip; 1215 next_ip++; 1216 1217 if (inst->opcode != BRW_OPCODE_MOV || 1218 inst->predicated || 1219 inst->dst.file != MRF || inst->src[0].file != GRF || 1220 inst->dst.type != inst->src[0].type || 1221 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 1222 continue; 1223 1224 /* Work out which hardware MRF registers are written by this 1225 * instruction. 1226 */ 1227 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1228 int mrf_high; 1229 if (inst->dst.hw_reg & BRW_MRF_COMPR4) { 1230 mrf_high = mrf_low + 4; 1231 } else if (c->dispatch_width == 16 && 1232 (!inst->force_uncompressed && !inst->force_sechalf)) { 1233 mrf_high = mrf_low + 1; 1234 } else { 1235 mrf_high = mrf_low; 1236 } 1237 1238 /* Can't compute-to-MRF this GRF if someone else was going to 1239 * read it later. 1240 */ 1241 if (this->virtual_grf_use[inst->src[0].reg] > ip) 1242 continue; 1243 1244 /* Found a move of a GRF to a MRF. 
Let's see if we can go 1245 * rewrite the thing that made this GRF to write into the MRF. 1246 */ 1247 fs_inst *scan_inst; 1248 for (scan_inst = (fs_inst *)inst->prev; 1249 scan_inst->prev != NULL; 1250 scan_inst = (fs_inst *)scan_inst->prev) { 1251 if (scan_inst->dst.file == GRF && 1252 scan_inst->dst.reg == inst->src[0].reg) { 1253 /* Found the last thing to write our reg we want to turn 1254 * into a compute-to-MRF. 1255 */ 1256 1257 if (scan_inst->is_tex()) { 1258 /* texturing writes several continuous regs, so we can't 1259 * compute-to-mrf that. 1260 */ 1261 break; 1262 } 1263 1264 /* If it's predicated, it (probably) didn't populate all 1265 * the channels. We might be able to rewrite everything 1266 * that writes that reg, but it would require smarter 1267 * tracking to delay the rewriting until complete success. 1268 */ 1269 if (scan_inst->predicated) 1270 break; 1271 1272 /* If it's half of register setup and not the same half as 1273 * our MOV we're trying to remove, bail for now. 1274 */ 1275 if (scan_inst->force_uncompressed != inst->force_uncompressed || 1276 scan_inst->force_sechalf != inst->force_sechalf) { 1277 break; 1278 } 1279 1280 /* SEND instructions can't have MRF as a destination. */ 1281 if (scan_inst->mlen) 1282 break; 1283 1284 if (intel->gen >= 6) { 1285 /* gen6 math instructions must have the destination be 1286 * GRF, so no compute-to-MRF for them. 1287 */ 1288 if (scan_inst->is_math()) { 1289 break; 1290 } 1291 } 1292 1293 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1294 /* Found the creator of our MRF's source value. */ 1295 scan_inst->dst.file = MRF; 1296 scan_inst->dst.hw_reg = inst->dst.hw_reg; 1297 scan_inst->saturate |= inst->saturate; 1298 inst->remove(); 1299 progress = true; 1300 } 1301 break; 1302 } 1303 1304 /* We don't handle flow control here. Most computation of 1305 * values that end up in MRFs are shortly before the MRF 1306 * write anyway. 
1307 */ 1308 if (scan_inst->opcode == BRW_OPCODE_DO || 1309 scan_inst->opcode == BRW_OPCODE_WHILE || 1310 scan_inst->opcode == BRW_OPCODE_ELSE || 1311 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1312 break; 1313 } 1314 1315 /* You can't read from an MRF, so if someone else reads our 1316 * MRF's source GRF that we wanted to rewrite, that stops us. 1317 */ 1318 bool interfered = false; 1319 for (int i = 0; i < 3; i++) { 1320 if (scan_inst->src[i].file == GRF && 1321 scan_inst->src[i].reg == inst->src[0].reg && 1322 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 1323 interfered = true; 1324 } 1325 } 1326 if (interfered) 1327 break; 1328 1329 if (scan_inst->dst.file == MRF) { 1330 /* If somebody else writes our MRF here, we can't 1331 * compute-to-MRF before that. 1332 */ 1333 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4; 1334 int scan_mrf_high; 1335 1336 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) { 1337 scan_mrf_high = scan_mrf_low + 4; 1338 } else if (c->dispatch_width == 16 && 1339 (!scan_inst->force_uncompressed && 1340 !scan_inst->force_sechalf)) { 1341 scan_mrf_high = scan_mrf_low + 1; 1342 } else { 1343 scan_mrf_high = scan_mrf_low; 1344 } 1345 1346 if (mrf_low == scan_mrf_low || 1347 mrf_low == scan_mrf_high || 1348 mrf_high == scan_mrf_low || 1349 mrf_high == scan_mrf_high) { 1350 break; 1351 } 1352 } 1353 1354 if (scan_inst->mlen > 0) { 1355 /* Found a SEND instruction, which means that there are 1356 * live values in MRFs from base_mrf to base_mrf + 1357 * scan_inst->mlen - 1. Don't go pushing our MRF write up 1358 * above it. 
1359 */ 1360 if (mrf_low >= scan_inst->base_mrf && 1361 mrf_low < scan_inst->base_mrf + scan_inst->mlen) { 1362 break; 1363 } 1364 if (mrf_high >= scan_inst->base_mrf && 1365 mrf_high < scan_inst->base_mrf + scan_inst->mlen) { 1366 break; 1367 } 1368 } 1369 } 1370 } 1371 1372 return progress; 1373} 1374 1375/** 1376 * Walks through basic blocks, locking for repeated MRF writes and 1377 * removing the later ones. 1378 */ 1379bool 1380fs_visitor::remove_duplicate_mrf_writes() 1381{ 1382 fs_inst *last_mrf_move[16]; 1383 bool progress = false; 1384 1385 /* Need to update the MRF tracking for compressed instructions. */ 1386 if (c->dispatch_width == 16) 1387 return false; 1388 1389 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1390 1391 foreach_iter(exec_list_iterator, iter, this->instructions) { 1392 fs_inst *inst = (fs_inst *)iter.get(); 1393 1394 switch (inst->opcode) { 1395 case BRW_OPCODE_DO: 1396 case BRW_OPCODE_WHILE: 1397 case BRW_OPCODE_IF: 1398 case BRW_OPCODE_ELSE: 1399 case BRW_OPCODE_ENDIF: 1400 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1401 continue; 1402 default: 1403 break; 1404 } 1405 1406 if (inst->opcode == BRW_OPCODE_MOV && 1407 inst->dst.file == MRF) { 1408 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 1409 if (prev_inst && inst->equals(prev_inst)) { 1410 inst->remove(); 1411 progress = true; 1412 continue; 1413 } 1414 } 1415 1416 /* Clear out the last-write records for MRFs that were overwritten. */ 1417 if (inst->dst.file == MRF) { 1418 last_mrf_move[inst->dst.hw_reg] = NULL; 1419 } 1420 1421 if (inst->mlen > 0) { 1422 /* Found a SEND instruction, which will include two or fewer 1423 * implied MRF writes. We could do better here. 1424 */ 1425 for (int i = 0; i < implied_mrf_writes(inst); i++) { 1426 last_mrf_move[inst->base_mrf + i] = NULL; 1427 } 1428 } 1429 1430 /* Clear out any MRF move records whose sources got overwritten. 
*/ 1431 if (inst->dst.file == GRF) { 1432 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 1433 if (last_mrf_move[i] && 1434 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 1435 last_mrf_move[i] = NULL; 1436 } 1437 } 1438 } 1439 1440 if (inst->opcode == BRW_OPCODE_MOV && 1441 inst->dst.file == MRF && 1442 inst->src[0].file == GRF && 1443 !inst->predicated) { 1444 last_mrf_move[inst->dst.hw_reg] = inst; 1445 } 1446 } 1447 1448 return progress; 1449} 1450 1451bool 1452fs_visitor::virtual_grf_interferes(int a, int b) 1453{ 1454 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 1455 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 1456 1457 /* We can't handle dead register writes here, without iterating 1458 * over the whole instruction stream to find every single dead 1459 * write to that register to compare to the live interval of the 1460 * other register. Just assert that dead_code_eliminate() has been 1461 * called. 1462 */ 1463 assert((this->virtual_grf_use[a] != -1 || 1464 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 1465 (this->virtual_grf_use[b] != -1 || 1466 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 1467 1468 /* If the register is used to store 16 values of less than float 1469 * size (only the case for pixel_[xy]), then we can't allocate 1470 * another dword-sized thing to that register that would be used in 1471 * the same instruction. This is because when the GPU decodes (for 1472 * example): 1473 * 1474 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 1475 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 1476 * 1477 * it's actually processed as: 1478 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 1479 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 1480 * 1481 * so our second half values in g6 got overwritten in the first 1482 * half. 
1483 */ 1484 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 1485 this->pixel_x.reg == b || 1486 this->pixel_y.reg == a || 1487 this->pixel_y.reg == b)) { 1488 return start <= end; 1489 } 1490 1491 return start < end; 1492} 1493 1494bool 1495fs_visitor::run() 1496{ 1497 uint32_t prog_offset_16 = 0; 1498 uint32_t orig_nr_params = c->prog_data.nr_params; 1499 1500 brw_wm_payload_setup(brw, c); 1501 1502 if (c->dispatch_width == 16) { 1503 /* align to 64 byte boundary. */ 1504 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { 1505 brw_NOP(p); 1506 } 1507 1508 /* Save off the start of this 16-wide program in case we succeed. */ 1509 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); 1510 1511 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 1512 } 1513 1514 if (0) { 1515 emit_dummy_fs(); 1516 } else { 1517 calculate_urb_setup(); 1518 if (intel->gen < 6) 1519 emit_interpolation_setup_gen4(); 1520 else 1521 emit_interpolation_setup_gen6(); 1522 1523 /* Generate FS IR for main(). (the visitor only descends into 1524 * functions called "main"). 1525 */ 1526 foreach_iter(exec_list_iterator, iter, *shader->ir) { 1527 ir_instruction *ir = (ir_instruction *)iter.get(); 1528 base_ir = ir; 1529 this->result = reg_undef; 1530 ir->accept(this); 1531 } 1532 1533 emit_fb_writes(); 1534 1535 split_virtual_grfs(); 1536 1537 setup_paramvalues_refs(); 1538 setup_pull_constants(); 1539 1540 bool progress; 1541 do { 1542 progress = false; 1543 1544 progress = remove_duplicate_mrf_writes() || progress; 1545 1546 progress = propagate_constants() || progress; 1547 progress = register_coalesce() || progress; 1548 progress = compute_to_mrf() || progress; 1549 progress = dead_code_eliminate() || progress; 1550 } while (progress); 1551 1552 schedule_instructions(); 1553 1554 assign_curb_setup(); 1555 assign_urb_setup(); 1556 1557 if (0) { 1558 /* Debug of register spilling: Go spill everything. 
*/ 1559 int virtual_grf_count = virtual_grf_next; 1560 for (int i = 1; i < virtual_grf_count; i++) { 1561 spill_reg(i); 1562 } 1563 } 1564 1565 if (0) 1566 assign_regs_trivial(); 1567 else { 1568 while (!assign_regs()) { 1569 if (failed) 1570 break; 1571 } 1572 } 1573 } 1574 assert(force_uncompressed_stack == 0); 1575 assert(force_sechalf_stack == 0); 1576 1577 if (failed) 1578 return false; 1579 1580 generate_code(); 1581 1582 if (c->dispatch_width == 8) { 1583 c->prog_data.reg_blocks = brw_register_blocks(grf_used); 1584 } else { 1585 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); 1586 c->prog_data.prog_offset_16 = prog_offset_16; 1587 1588 /* Make sure we didn't try to sneak in an extra uniform */ 1589 assert(orig_nr_params == c->prog_data.nr_params); 1590 } 1591 1592 return !failed; 1593} 1594 1595bool 1596brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, 1597 struct gl_shader_program *prog) 1598{ 1599 struct intel_context *intel = &brw->intel; 1600 1601 if (!prog) 1602 return false; 1603 1604 struct brw_shader *shader = 1605 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 1606 if (!shader) 1607 return false; 1608 1609 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1610 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 1611 _mesa_print_ir(shader->ir, NULL); 1612 printf("\n\n"); 1613 } 1614 1615 /* Now the main event: Visit the shader IR and generate our FS IR for it. 
1616 */ 1617 c->dispatch_width = 8; 1618 1619 fs_visitor v(c, prog, shader); 1620 if (!v.run()) { 1621 prog->LinkStatus = GL_FALSE; 1622 prog->InfoLog = ralloc_strdup(prog, v.fail_msg); 1623 1624 return false; 1625 } 1626 1627 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { 1628 c->dispatch_width = 16; 1629 fs_visitor v2(c, prog, shader); 1630 v2.import_uniforms(v.variable_ht); 1631 v2.run(); 1632 } 1633 1634 c->prog_data.dispatch_width = 8; 1635 1636 return true; 1637} 1638 1639bool 1640brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) 1641{ 1642 struct brw_context *brw = brw_context(ctx); 1643 struct brw_wm_prog_key key; 1644 struct gl_fragment_program *fp = prog->FragmentProgram; 1645 struct brw_fragment_program *bfp = brw_fragment_program(fp); 1646 1647 if (!fp) 1648 return true; 1649 1650 memset(&key, 0, sizeof(key)); 1651 1652 if (fp->UsesKill) 1653 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; 1654 1655 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 1656 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; 1657 1658 /* Just assume depth testing. 
*/ 1659 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; 1660 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; 1661 1662 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); 1663 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { 1664 int vp_index = -1; 1665 1666 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) 1667 continue; 1668 1669 key.proj_attrib_mask |= 1 << i; 1670 1671 if (i <= FRAG_ATTRIB_TEX7) 1672 vp_index = i; 1673 else if (i >= FRAG_ATTRIB_VAR0) 1674 vp_index = i - FRAG_ATTRIB_VAR0 + VERT_RESULT_VAR0; 1675 1676 if (vp_index >= 0) 1677 key.vp_outputs_written |= BITFIELD64_BIT(vp_index); 1678 } 1679 1680 key.clamp_fragment_color = true; 1681 1682 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) { 1683 /* FINISHME: depth compares might use (0,0,0,W) for example */ 1684 key.tex_swizzles[i] = SWIZZLE_XYZW; 1685 } 1686 1687 if (fp->Base.InputsRead & FRAG_BIT_WPOS) { 1688 key.drawable_height = ctx->DrawBuffer->Height; 1689 key.render_to_fbo = ctx->DrawBuffer->Name != 0; 1690 } 1691 1692 key.nr_color_regions = 1; 1693 1694 key.program_string_id = bfp->id; 1695 1696 drm_intel_bo *old_prog_bo = brw->wm.prog_bo; 1697 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; 1698 brw->wm.prog_bo = NULL; 1699 1700 bool success = do_wm_prog(brw, prog, bfp, &key); 1701 1702 drm_intel_bo_unreference(brw->wm.prog_bo); 1703 brw->wm.prog_bo = old_prog_bo; 1704 brw->wm.prog_data = old_prog_data; 1705 1706 return success; 1707} 1708