brw_fs.cpp revision 82d25963a838cfebdeb9b080169979329ee850ea
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs.cpp 25 * 26 * This file drives the GLSL IR -> LIR translation, contains the 27 * optimizations on the LIR, and drives the generation of native code 28 * from the LIR. 29 */ 30 31extern "C" { 32 33#include <sys/types.h> 34 35#include "main/macros.h" 36#include "main/shaderobj.h" 37#include "main/uniforms.h" 38#include "main/fbobject.h" 39#include "program/prog_parameter.h" 40#include "program/prog_print.h" 41#include "program/register_allocate.h" 42#include "program/sampler.h" 43#include "program/hash_table.h" 44#include "brw_context.h" 45#include "brw_eu.h" 46#include "brw_wm.h" 47} 48#include "brw_shader.h" 49#include "brw_fs.h" 50#include "glsl/glsl_types.h" 51#include "glsl/ir_print_visitor.h" 52 53int 54fs_visitor::type_size(const struct glsl_type *type) 55{ 56 unsigned int size, i; 57 58 switch (type->base_type) { 59 case GLSL_TYPE_UINT: 60 case GLSL_TYPE_INT: 61 case GLSL_TYPE_FLOAT: 62 case GLSL_TYPE_BOOL: 63 return type->components(); 64 case GLSL_TYPE_ARRAY: 65 return type_size(type->fields.array) * type->length; 66 case GLSL_TYPE_STRUCT: 67 size = 0; 68 for (i = 0; i < type->length; i++) { 69 size += type_size(type->fields.structure[i].type); 70 } 71 return size; 72 case GLSL_TYPE_SAMPLER: 73 /* Samplers take up no register space, since they're baked in at 74 * link time. 75 */ 76 return 0; 77 default: 78 assert(!"not reached"); 79 return 0; 80 } 81} 82 83void 84fs_visitor::fail(const char *format, ...) 85{ 86 va_list va; 87 char *msg; 88 89 if (failed) 90 return; 91 92 failed = true; 93 94 va_start(va, format); 95 msg = ralloc_vasprintf(mem_ctx, format, va); 96 va_end(va); 97 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); 98 99 this->fail_msg = msg; 100 101 if (INTEL_DEBUG & DEBUG_WM) { 102 fprintf(stderr, "%s", msg); 103 } 104} 105 106void 107fs_visitor::push_force_uncompressed() 108{ 109 force_uncompressed_stack++; 110} 111 112void 113fs_visitor::pop_force_uncompressed() 114{ 115 force_uncompressed_stack--; 116 assert(force_uncompressed_stack >= 0); 117} 118 119void 120fs_visitor::push_force_sechalf() 121{ 122 force_sechalf_stack++; 123} 124 125void 126fs_visitor::pop_force_sechalf() 127{ 128 force_sechalf_stack--; 129 assert(force_sechalf_stack >= 0); 130} 131 132/** 133 * Returns how many MRFs an FS opcode will write over. 
134 * 135 * Note that this is not the 0 or 1 implied writes in an actual gen 136 * instruction -- the FS opcodes often generate MOVs in addition. 137 */ 138int 139fs_visitor::implied_mrf_writes(fs_inst *inst) 140{ 141 if (inst->mlen == 0) 142 return 0; 143 144 switch (inst->opcode) { 145 case SHADER_OPCODE_RCP: 146 case SHADER_OPCODE_RSQ: 147 case SHADER_OPCODE_SQRT: 148 case SHADER_OPCODE_EXP2: 149 case SHADER_OPCODE_LOG2: 150 case SHADER_OPCODE_SIN: 151 case SHADER_OPCODE_COS: 152 return 1 * c->dispatch_width / 8; 153 case SHADER_OPCODE_POW: 154 case SHADER_OPCODE_INT_QUOTIENT: 155 case SHADER_OPCODE_INT_REMAINDER: 156 return 2 * c->dispatch_width / 8; 157 case SHADER_OPCODE_TEX: 158 case FS_OPCODE_TXB: 159 case SHADER_OPCODE_TXD: 160 case SHADER_OPCODE_TXF: 161 case SHADER_OPCODE_TXL: 162 case SHADER_OPCODE_TXS: 163 return 1; 164 case FS_OPCODE_FB_WRITE: 165 return 2; 166 case FS_OPCODE_PULL_CONSTANT_LOAD: 167 case FS_OPCODE_UNSPILL: 168 return 1; 169 case FS_OPCODE_SPILL: 170 return 2; 171 default: 172 assert(!"not reached"); 173 return inst->mlen; 174 } 175} 176 177int 178fs_visitor::virtual_grf_alloc(int size) 179{ 180 if (virtual_grf_array_size <= virtual_grf_next) { 181 if (virtual_grf_array_size == 0) 182 virtual_grf_array_size = 16; 183 else 184 virtual_grf_array_size *= 2; 185 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 186 virtual_grf_array_size); 187 } 188 virtual_grf_sizes[virtual_grf_next] = size; 189 return virtual_grf_next++; 190} 191 192/** Fixed HW reg constructor. */ 193fs_reg::fs_reg(enum register_file file, int reg) 194{ 195 init(); 196 this->file = file; 197 this->reg = reg; 198 this->type = BRW_REGISTER_TYPE_F; 199} 200 201/** Fixed HW reg constructor. */ 202fs_reg::fs_reg(enum register_file file, int reg, uint32_t type) 203{ 204 init(); 205 this->file = file; 206 this->reg = reg; 207 this->type = type; 208} 209 210/** Automatic reg constructor. */ 211fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 212{ 213 init(); 214 215 this->file = GRF; 216 this->reg = v->virtual_grf_alloc(v->type_size(type)); 217 this->reg_offset = 0; 218 this->type = brw_type_for_base_type(type); 219} 220 221fs_reg * 222fs_visitor::variable_storage(ir_variable *var) 223{ 224 return (fs_reg *)hash_table_find(this->variable_ht, var); 225} 226 227void 228import_uniforms_callback(const void *key, 229 void *data, 230 void *closure) 231{ 232 struct hash_table *dst_ht = (struct hash_table *)closure; 233 const fs_reg *reg = (const fs_reg *)data; 234 235 if (reg->file != UNIFORM) 236 return; 237 238 hash_table_insert(dst_ht, data, key); 239} 240 241/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 242 * This brings in those uniform definitions 243 */ 244void 245fs_visitor::import_uniforms(fs_visitor *v) 246{ 247 hash_table_call_foreach(v->variable_ht, 248 import_uniforms_callback, 249 variable_ht); 250 this->params_remap = v->params_remap; 251} 252 253/* Our support for uniforms is piggy-backed on the struct 254 * gl_fragment_program, because that's where the values actually 255 * get stored, rather than in some global gl_shader_program uniform 256 * store. 
257 */ 258int 259fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 260{ 261 unsigned int offset = 0; 262 263 if (type->is_matrix()) { 264 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 265 type->vector_elements, 266 1); 267 268 for (unsigned int i = 0; i < type->matrix_columns; i++) { 269 offset += setup_uniform_values(loc + offset, column); 270 } 271 272 return offset; 273 } 274 275 switch (type->base_type) { 276 case GLSL_TYPE_FLOAT: 277 case GLSL_TYPE_UINT: 278 case GLSL_TYPE_INT: 279 case GLSL_TYPE_BOOL: 280 for (unsigned int i = 0; i < type->vector_elements; i++) { 281 unsigned int param = c->prog_data.nr_params++; 282 283 assert(param < ARRAY_SIZE(c->prog_data.param)); 284 285 if (ctx->Const.NativeIntegers) { 286 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 287 } else { 288 switch (type->base_type) { 289 case GLSL_TYPE_FLOAT: 290 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 291 break; 292 case GLSL_TYPE_UINT: 293 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 294 break; 295 case GLSL_TYPE_INT: 296 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 297 break; 298 case GLSL_TYPE_BOOL: 299 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 300 break; 301 default: 302 assert(!"not reached"); 303 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 304 break; 305 } 306 } 307 this->param_index[param] = loc; 308 this->param_offset[param] = i; 309 } 310 return 1; 311 312 case GLSL_TYPE_STRUCT: 313 for (unsigned int i = 0; i < type->length; i++) { 314 offset += setup_uniform_values(loc + offset, 315 type->fields.structure[i].type); 316 } 317 return offset; 318 319 case GLSL_TYPE_ARRAY: 320 for (unsigned int i = 0; i < type->length; i++) { 321 offset += setup_uniform_values(loc + offset, type->fields.array); 322 } 323 return offset; 324 325 case GLSL_TYPE_SAMPLER: 326 /* The sampler takes up a slot, but we don't use any values from it. */ 327 return 1; 328 329 default: 330 assert(!"not reached"); 331 return 0; 332 } 333} 334 335 336/* Our support for builtin uniforms is even scarier than non-builtin. 337 * It sits on top of the PROG_STATE_VAR parameters that are 338 * automatically updated from GL context state. 339 */ 340void 341fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 342{ 343 const ir_state_slot *const slots = ir->state_slots; 344 assert(ir->state_slots != NULL); 345 346 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 347 /* This state reference has already been setup by ir_to_mesa, but we'll 348 * get the same index back here. 349 */ 350 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 351 (gl_state_index *)slots[i].tokens); 352 353 /* Add each of the unique swizzles of the element as a parameter. 354 * This'll end up matching the expected layout of the 355 * array/matrix/structure we're trying to fill in. 
356 */ 357 int last_swiz = -1; 358 for (unsigned int j = 0; j < 4; j++) { 359 int swiz = GET_SWZ(slots[i].swizzle, j); 360 if (swiz == last_swiz) 361 break; 362 last_swiz = swiz; 363 364 c->prog_data.param_convert[c->prog_data.nr_params] = 365 PARAM_NO_CONVERT; 366 this->param_index[c->prog_data.nr_params] = index; 367 this->param_offset[c->prog_data.nr_params] = swiz; 368 c->prog_data.nr_params++; 369 } 370 } 371} 372 373fs_reg * 374fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 375{ 376 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 377 fs_reg wpos = *reg; 378 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 379 380 /* gl_FragCoord.x */ 381 if (ir->pixel_center_integer) { 382 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 383 } else { 384 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 385 } 386 wpos.reg_offset++; 387 388 /* gl_FragCoord.y */ 389 if (!flip && ir->pixel_center_integer) { 390 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 391 } else { 392 fs_reg pixel_y = this->pixel_y; 393 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 394 395 if (flip) { 396 pixel_y.negate = true; 397 offset += c->key.drawable_height - 1.0; 398 } 399 400 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 401 } 402 wpos.reg_offset++; 403 404 /* gl_FragCoord.z */ 405 if (intel->gen >= 6) { 406 emit(BRW_OPCODE_MOV, wpos, 407 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 408 } else { 409 emit(FS_OPCODE_LINTERP, wpos, 410 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 411 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 412 interp_reg(FRAG_ATTRIB_WPOS, 2)); 413 } 414 wpos.reg_offset++; 415 416 /* gl_FragCoord.w: Already set up in emit_interpolation */ 417 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 418 419 return reg; 420} 421 422fs_reg * 423fs_visitor::emit_general_interpolation(ir_variable *ir) 424{ 425 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 426 reg->type = brw_type_for_base_type(ir->type->get_scalar_type()); 427 fs_reg attr = *reg; 428 429 unsigned int array_elements; 430 const glsl_type *type; 431 432 if (ir->type->is_array()) { 433 array_elements = ir->type->length; 434 if (array_elements == 0) { 435 fail("dereferenced array '%s' has length 0\n", ir->name); 436 } 437 type = ir->type->fields.array; 438 } else { 439 array_elements = 1; 440 type = ir->type; 441 } 442 443 glsl_interp_qualifier interpolation_mode = 444 ir->determine_interpolation_mode(c->key.flat_shade); 445 446 int location = ir->location; 447 for (unsigned int i = 0; i < array_elements; i++) { 448 for (unsigned int j = 0; j < type->matrix_columns; j++) { 449 if (urb_setup[location] == -1) { 450 /* If there's no incoming setup data for this slot, don't 451 * emit interpolation for it. 452 */ 453 attr.reg_offset += type->vector_elements; 454 location++; 455 continue; 456 } 457 458 if (interpolation_mode == INTERP_QUALIFIER_FLAT) { 459 /* Constant interpolation (flat shading) case. The SF has 460 * handed us defined values in only the constant offset 461 * field of the setup reg. 462 */ 463 for (unsigned int k = 0; k < type->vector_elements; k++) { 464 struct brw_reg interp = interp_reg(location, k); 465 interp = suboffset(interp, 3); 466 interp.type = reg->type; 467 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 468 attr.reg_offset++; 469 } 470 } else { 471 /* Smooth/noperspective interpolation case. 
*/ 472 for (unsigned int k = 0; k < type->vector_elements; k++) { 473 /* FINISHME: At some point we probably want to push 474 * this farther by giving similar treatment to the 475 * other potentially constant components of the 476 * attribute, as well as making brw_vs_constval.c 477 * handle varyings other than gl_TexCoord. 478 */ 479 if (location >= FRAG_ATTRIB_TEX0 && 480 location <= FRAG_ATTRIB_TEX7 && 481 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) { 482 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f)); 483 } else { 484 struct brw_reg interp = interp_reg(location, k); 485 brw_wm_barycentric_interp_mode barycoord_mode; 486 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) 487 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; 488 else 489 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; 490 emit(FS_OPCODE_LINTERP, attr, 491 this->delta_x[barycoord_mode], 492 this->delta_y[barycoord_mode], fs_reg(interp)); 493 if (intel->gen < 6) { 494 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 495 } 496 } 497 attr.reg_offset++; 498 } 499 500 } 501 location++; 502 } 503 } 504 505 return reg; 506} 507 508fs_reg * 509fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 510{ 511 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 512 513 /* The frontfacing comes in as a bit in the thread payload. */ 514 if (intel->gen >= 6) { 515 emit(BRW_OPCODE_ASR, *reg, 516 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 517 fs_reg(15)); 518 emit(BRW_OPCODE_NOT, *reg, *reg); 519 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 520 } else { 521 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 522 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 523 * us front face 524 */ 525 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 526 fs_reg(r1_6ud), 527 fs_reg(1u << 31)); 528 inst->conditional_mod = BRW_CONDITIONAL_L; 529 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 530 } 531 532 return reg; 533} 534 535fs_inst * 536fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src) 537{ 538 switch (opcode) { 539 case SHADER_OPCODE_RCP: 540 case SHADER_OPCODE_RSQ: 541 case SHADER_OPCODE_SQRT: 542 case SHADER_OPCODE_EXP2: 543 case SHADER_OPCODE_LOG2: 544 case SHADER_OPCODE_SIN: 545 case SHADER_OPCODE_COS: 546 break; 547 default: 548 assert(!"not reached: bad math opcode"); 549 return NULL; 550 } 551 552 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 553 * might be able to do better by doing execsize = 1 math and then 554 * expanding that result out, but we would need to be careful with 555 * masking. 556 * 557 * Gen 6 hardware ignores source modifiers (negate and abs) on math 558 * instructions, so we also move to a temp to set those up. 
559 */ 560 if (intel->gen == 6 && (src.file == UNIFORM || 561 src.abs || 562 src.negate)) { 563 fs_reg expanded = fs_reg(this, glsl_type::float_type); 564 emit(BRW_OPCODE_MOV, expanded, src); 565 src = expanded; 566 } 567 568 fs_inst *inst = emit(opcode, dst, src); 569 570 if (intel->gen < 6) { 571 inst->base_mrf = 2; 572 inst->mlen = c->dispatch_width / 8; 573 } 574 575 return inst; 576} 577 578fs_inst * 579fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1) 580{ 581 int base_mrf = 2; 582 fs_inst *inst; 583 584 switch (opcode) { 585 case SHADER_OPCODE_POW: 586 case SHADER_OPCODE_INT_QUOTIENT: 587 case SHADER_OPCODE_INT_REMAINDER: 588 break; 589 default: 590 assert(!"not reached: unsupported binary math opcode."); 591 return NULL; 592 } 593 594 if (intel->gen >= 7) { 595 inst = emit(opcode, dst, src0, src1); 596 } else if (intel->gen == 6) { 597 /* Can't do hstride == 0 args to gen6 math, so expand it out. 598 * 599 * The hardware ignores source modifiers (negate and abs) on math 600 * instructions, so we also move to a temp to set those up. 601 */ 602 if (src0.file == UNIFORM || src0.abs || src0.negate) { 603 fs_reg expanded = fs_reg(this, glsl_type::float_type); 604 expanded.type = src0.type; 605 emit(BRW_OPCODE_MOV, expanded, src0); 606 src0 = expanded; 607 } 608 609 if (src1.file == UNIFORM || src1.abs || src1.negate) { 610 fs_reg expanded = fs_reg(this, glsl_type::float_type); 611 expanded.type = src1.type; 612 emit(BRW_OPCODE_MOV, expanded, src1); 613 src1 = expanded; 614 } 615 616 inst = emit(opcode, dst, src0, src1); 617 } else { 618 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 619 * "Message Payload": 620 * 621 * "Operand0[7]. For the INT DIV functions, this operand is the 622 * denominator." 623 * ... 624 * "Operand1[7]. For the INT DIV functions, this operand is the 625 * numerator." 626 */ 627 bool is_int_div = opcode != SHADER_OPCODE_POW; 628 fs_reg &op0 = is_int_div ? src1 : src0; 629 fs_reg &op1 = is_int_div ? src0 : src1; 630 631 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1); 632 inst = emit(opcode, dst, op0, reg_null_f); 633 634 inst->base_mrf = base_mrf; 635 inst->mlen = 2 * c->dispatch_width / 8; 636 } 637 return inst; 638} 639 640/** 641 * To be called after the last _mesa_add_state_reference() call, to 642 * set up prog_data.param[] for assign_curb_setup() and 643 * setup_pull_constants(). 644 */ 645void 646fs_visitor::setup_paramvalues_refs() 647{ 648 if (c->dispatch_width != 8) 649 return; 650 651 /* Set up the pointers to ParamValues now that that array is finalized. */ 652 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 653 c->prog_data.param[i] = 654 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] + 655 this->param_offset[i]; 656 } 657} 658 659void 660fs_visitor::assign_curb_setup() 661{ 662 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 663 if (c->dispatch_width == 8) { 664 c->prog_data.first_curbe_grf = c->nr_payload_regs; 665 } else { 666 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 667 } 668 669 /* Map the offsets in the UNIFORM file to fixed HW regs. 
*/ 670 foreach_list(node, &this->instructions) { 671 fs_inst *inst = (fs_inst *)node; 672 673 for (unsigned int i = 0; i < 3; i++) { 674 if (inst->src[i].file == UNIFORM) { 675 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; 676 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 677 constant_nr / 8, 678 constant_nr % 8); 679 680 inst->src[i].file = FIXED_HW_REG; 681 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 682 } 683 } 684 } 685} 686 687void 688fs_visitor::calculate_urb_setup() 689{ 690 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 691 urb_setup[i] = -1; 692 } 693 694 int urb_next = 0; 695 /* Figure out where each of the incoming setup attributes lands. */ 696 if (intel->gen >= 6) { 697 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 698 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { 699 urb_setup[i] = urb_next++; 700 } 701 } 702 } else { 703 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 704 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 705 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 706 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); 707 708 if (fp_index >= 0) 709 urb_setup[fp_index] = urb_next++; 710 } 711 } 712 713 /* 714 * It's a FS only attribute, and we did interpolation for this attribute 715 * in SF thread. So, count it here, too. 716 * 717 * See compile_sf_prog() for more info. 718 */ 719 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC)) 720 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++; 721 } 722 723 /* Each attribute is 4 setup channels, each of which is half a reg. */ 724 c->prog_data.urb_read_length = urb_next * 2; 725} 726 727void 728fs_visitor::assign_urb_setup() 729{ 730 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 731 732 /* Offset all the urb_setup[] index by the actual position of the 733 * setup regs, now that the location of the constants has been chosen. 734 */ 735 foreach_list(node, &this->instructions) { 736 fs_inst *inst = (fs_inst *)node; 737 738 if (inst->opcode == FS_OPCODE_LINTERP) { 739 assert(inst->src[2].file == FIXED_HW_REG); 740 inst->src[2].fixed_hw_reg.nr += urb_start; 741 } 742 743 if (inst->opcode == FS_OPCODE_CINTERP) { 744 assert(inst->src[0].file == FIXED_HW_REG); 745 inst->src[0].fixed_hw_reg.nr += urb_start; 746 } 747 } 748 749 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 750} 751 752/** 753 * Split large virtual GRFs into separate components if we can. 754 * 755 * This is mostly duplicated with what brw_fs_vector_splitting does, 756 * but that's really conservative because it's afraid of doing 757 * splitting that doesn't result in real progress after the rest of 758 * the optimization phases, which would cause infinite looping in 759 * optimization. We can do it once here, safely. This also has the 760 * opportunity to split interpolated values, or maybe even uniforms, 761 * which we don't have at the IR level. 762 * 763 * We want to split, because virtual GRFs are what we register 764 * allocate and spill (due to contiguousness requirements for some 765 * instructions), and they're what we naturally generate in the 766 * codegen process, but most virtual GRFs don't actually need to be 767 * contiguous sets of GRFs. If we split, we'll end up with reduced 768 * live intervals and better dead code elimination and coalescing. 
769 */ 770void 771fs_visitor::split_virtual_grfs() 772{ 773 int num_vars = this->virtual_grf_next; 774 bool split_grf[num_vars]; 775 int new_virtual_grf[num_vars]; 776 777 /* Try to split anything > 0 sized. */ 778 for (int i = 0; i < num_vars; i++) { 779 if (this->virtual_grf_sizes[i] != 1) 780 split_grf[i] = true; 781 else 782 split_grf[i] = false; 783 } 784 785 if (brw->has_pln && 786 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) { 787 /* PLN opcodes rely on the delta_xy being contiguous. We only have to 788 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to 789 * Gen6, that was the only supported interpolation mode, and since Gen6, 790 * delta_x and delta_y are in fixed hardware registers. 791 */ 792 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] = 793 false; 794 } 795 796 foreach_list(node, &this->instructions) { 797 fs_inst *inst = (fs_inst *)node; 798 799 /* Texturing produces 4 contiguous registers, so no splitting. */ 800 if (inst->is_tex()) { 801 split_grf[inst->dst.reg] = false; 802 } 803 } 804 805 /* Allocate new space for split regs. Note that the virtual 806 * numbers will be contiguous. 807 */ 808 for (int i = 0; i < num_vars; i++) { 809 if (split_grf[i]) { 810 new_virtual_grf[i] = virtual_grf_alloc(1); 811 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 812 int reg = virtual_grf_alloc(1); 813 assert(reg == new_virtual_grf[i] + j - 1); 814 (void) reg; 815 } 816 this->virtual_grf_sizes[i] = 1; 817 } 818 } 819 820 foreach_list(node, &this->instructions) { 821 fs_inst *inst = (fs_inst *)node; 822 823 if (inst->dst.file == GRF && 824 split_grf[inst->dst.reg] && 825 inst->dst.reg_offset != 0) { 826 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 827 inst->dst.reg_offset - 1); 828 inst->dst.reg_offset = 0; 829 } 830 for (int i = 0; i < 3; i++) { 831 if (inst->src[i].file == GRF && 832 split_grf[inst->src[i].reg] && 833 inst->src[i].reg_offset != 0) { 834 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 835 inst->src[i].reg_offset - 1); 836 inst->src[i].reg_offset = 0; 837 } 838 } 839 } 840 this->live_intervals_valid = false; 841} 842 843bool 844fs_visitor::remove_dead_constants() 845{ 846 if (c->dispatch_width == 8) { 847 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params); 848 849 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) 850 this->params_remap[i] = -1; 851 852 /* Find which params are still in use. */ 853 foreach_list(node, &this->instructions) { 854 fs_inst *inst = (fs_inst *)node; 855 856 for (int i = 0; i < 3; i++) { 857 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; 858 859 if (inst->src[i].file != UNIFORM) 860 continue; 861 862 assert(constant_nr < (int)c->prog_data.nr_params); 863 864 /* For now, set this to non-negative. We'll give it the 865 * actual new number in a moment, in order to keep the 866 * register numbers nicely ordered. 867 */ 868 this->params_remap[constant_nr] = 0; 869 } 870 } 871 872 /* Figure out what the new numbers for the params will be. At some 873 * point when we're doing uniform array access, we're going to want 874 * to keep the distinction between .reg and .reg_offset, but for 875 * now we don't care. 876 */ 877 unsigned int new_nr_params = 0; 878 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 879 if (this->params_remap[i] != -1) { 880 this->params_remap[i] = new_nr_params++; 881 } 882 } 883 884 /* Update the list of params to be uploaded to match our new numbering. 
       */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
         c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (c->dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst);
         pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
      c->prog_data.pull_param_convert[i] =
         c->prog_data.param_convert[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}

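/* Worked example of the demotion above (illustrative numbers only): with
 * c->prog_data.nr_params == 160 in the 8-wide pass, pull_uniform_base is 128
 * and the last 32 params move to pull_param[].  A source reading uniform 133
 * is rewritten to read the GRF written by a FS_OPCODE_PULL_CONSTANT_LOAD with
 * offset ((133 - 128) * 4) & ~15 == 16 and smear (133 - 128) & 3 == 1,
 * i.e. the second float of the 16-byte block starting at byte offset 16.
 */
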
/**
 * Attempts to move immediate constants into the immediate
 * constant slot of following instructions.
 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, and you can't do abs/negate on them.
 */

bool
fs_visitor::propagate_constants()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicated ||
          inst->dst.file != GRF || inst->src[0].file != IMM ||
          inst->dst.type != inst->src[0].type ||
          (c->dispatch_width == 16 &&
           (inst->force_uncompressed || inst->force_sechalf)))
         continue;

      /* Don't bother with cases where we should have had the
       * operation on the constant folded in GLSL already.
       */
      if (inst->saturate)
         continue;

      /* Found a move of a constant to a GRF.  Find anything else using the GRF
       * before it's written, and replace it with the constant if we can.
       */
      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         for (int i = 2; i >= 0; i--) {
            if (scan_inst->src[i].file != GRF ||
                scan_inst->src[i].reg != inst->dst.reg ||
                scan_inst->src[i].reg_offset != inst->dst.reg_offset)
               continue;

            /* Don't bother with cases where we should have had the
             * operation on the constant folded in GLSL already.
             */
            if (scan_inst->src[i].negate || scan_inst->src[i].abs)
               continue;

            switch (scan_inst->opcode) {
            case BRW_OPCODE_MOV:
               scan_inst->src[i] = inst->src[0];
               progress = true;
               break;

            case BRW_OPCODE_MUL:
            case BRW_OPCODE_ADD:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  /* Fit this constant in by commuting the operands.
                   * Exception: we can't do this for 32-bit integer MUL
                   * because it's asymmetric.
                   */
                  if (scan_inst->opcode == BRW_OPCODE_MUL &&
                      (scan_inst->src[1].type == BRW_REGISTER_TYPE_D ||
                       scan_inst->src[1].type == BRW_REGISTER_TYPE_UD))
                     break;
                  scan_inst->src[0] = scan_inst->src[1];
                  scan_inst->src[1] = inst->src[0];
                  progress = true;
               }
               break;

            case BRW_OPCODE_CMP:
            case BRW_OPCODE_IF:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  uint32_t new_cmod;

                  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
                  if (new_cmod != ~0u) {
                     /* Fit this constant in by swapping the operands and
                      * flipping the test.
                      */
                     scan_inst->src[0] = scan_inst->src[1];
                     scan_inst->src[1] = inst->src[0];
                     scan_inst->conditional_mod = new_cmod;
                     progress = true;
                  }
               }
               break;

            case BRW_OPCODE_SEL:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  scan_inst->src[0] = scan_inst->src[1];
                  scan_inst->src[1] = inst->src[0];

                  /* If this was predicated, flipping operands means
                   * we also need to flip the predicate.
                   */
                  if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
                     scan_inst->predicate_inverse =
                        !scan_inst->predicate_inverse;
                  }
                  progress = true;
               }
               break;

            case SHADER_OPCODE_RCP:
               /* The hardware doesn't do math on immediate values
                * (because why are you doing that, seriously?), but
                * the correct answer is to just constant fold it
                * anyway.
                */
               assert(i == 0);
               if (inst->src[0].imm.f != 0.0f) {
                  scan_inst->opcode = BRW_OPCODE_MOV;
                  scan_inst->src[0] = inst->src[0];
                  scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
                  progress = true;
               }
               break;

            default:
               break;
            }
         }

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->dst.reg &&
             (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
              scan_inst->is_tex())) {
            break;
         }
      }
   }

   if (progress)
      this->live_intervals_valid = false;

   return progress;
}


/**
 * Performs simple algebraic simplifications of expressions with an
 * immediate operand, such as replacing a multiply by 1.0 with a plain
 * MOV of the other source.
 */

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 1.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

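/* Worked example of the elimination above (illustrative numbers only): if
 * the instruction at ip 12 writes GRF 7 but virtual_grf_use[7] is 9, nothing
 * at or after ip 12 reads GRF 7 again, so that write is dead and removed.
 */
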
/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicated ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}

bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicated ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
1329 */ 1330 bool interfered = false; 1331 1332 for (fs_inst *scan_inst = (fs_inst *)inst->next; 1333 !scan_inst->is_tail_sentinel(); 1334 scan_inst = (fs_inst *)scan_inst->next) { 1335 if (scan_inst->dst.file == GRF) { 1336 if (scan_inst->dst.reg == inst->dst.reg && 1337 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1338 scan_inst->is_tex())) { 1339 interfered = true; 1340 break; 1341 } 1342 if (inst->src[0].file == GRF && 1343 scan_inst->dst.reg == inst->src[0].reg && 1344 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 1345 scan_inst->is_tex())) { 1346 interfered = true; 1347 break; 1348 } 1349 } 1350 1351 /* The gen6 MATH instruction can't handle source modifiers or 1352 * unusual register regions, so avoid coalescing those for 1353 * now. We should do something more specific. 1354 */ 1355 if (intel->gen >= 6 && 1356 scan_inst->is_math() && 1357 (has_source_modifiers || inst->src[0].file == UNIFORM)) { 1358 interfered = true; 1359 break; 1360 } 1361 1362 /* The accumulator result appears to get used for the 1363 * conditional modifier generation. When negating a UD 1364 * value, there is a 33rd bit generated for the sign in the 1365 * accumulator value, so now you can't check, for example, 1366 * equality with a 32-bit value. See piglit fs-op-neg-uint. 1367 */ 1368 if (scan_inst->conditional_mod && 1369 inst->src[0].negate && 1370 inst->src[0].type == BRW_REGISTER_TYPE_UD) { 1371 interfered = true; 1372 break; 1373 } 1374 } 1375 if (interfered) { 1376 continue; 1377 } 1378 1379 /* Rewrite the later usage to point at the source of the move to 1380 * be removed. 1381 */ 1382 for (fs_inst *scan_inst = inst; 1383 !scan_inst->is_tail_sentinel(); 1384 scan_inst = (fs_inst *)scan_inst->next) { 1385 for (int i = 0; i < 3; i++) { 1386 if (scan_inst->src[i].file == GRF && 1387 scan_inst->src[i].reg == inst->dst.reg && 1388 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 1389 fs_reg new_src = inst->src[0]; 1390 if (scan_inst->src[i].abs) { 1391 new_src.negate = 0; 1392 new_src.abs = 1; 1393 } 1394 new_src.negate ^= scan_inst->src[i].negate; 1395 scan_inst->src[i] = new_src; 1396 } 1397 } 1398 } 1399 1400 inst->remove(); 1401 progress = true; 1402 } 1403 1404 if (progress) 1405 live_intervals_valid = false; 1406 1407 return progress; 1408} 1409 1410 1411bool 1412fs_visitor::compute_to_mrf() 1413{ 1414 bool progress = false; 1415 int next_ip = 0; 1416 1417 calculate_live_intervals(); 1418 1419 foreach_list_safe(node, &this->instructions) { 1420 fs_inst *inst = (fs_inst *)node; 1421 1422 int ip = next_ip; 1423 next_ip++; 1424 1425 if (inst->opcode != BRW_OPCODE_MOV || 1426 inst->predicated || 1427 inst->dst.file != MRF || inst->src[0].file != GRF || 1428 inst->dst.type != inst->src[0].type || 1429 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 1430 continue; 1431 1432 /* Work out which hardware MRF registers are written by this 1433 * instruction. 1434 */ 1435 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4; 1436 int mrf_high; 1437 if (inst->dst.reg & BRW_MRF_COMPR4) { 1438 mrf_high = mrf_low + 4; 1439 } else if (c->dispatch_width == 16 && 1440 (!inst->force_uncompressed && !inst->force_sechalf)) { 1441 mrf_high = mrf_low + 1; 1442 } else { 1443 mrf_high = mrf_low; 1444 } 1445 1446 /* Can't compute-to-MRF this GRF if someone else was going to 1447 * read it later. 1448 */ 1449 if (this->virtual_grf_use[inst->src[0].reg] > ip) 1450 continue; 1451 1452 /* Found a move of a GRF to a MRF. 
Let's see if we can go 1453 * rewrite the thing that made this GRF to write into the MRF. 1454 */ 1455 fs_inst *scan_inst; 1456 for (scan_inst = (fs_inst *)inst->prev; 1457 scan_inst->prev != NULL; 1458 scan_inst = (fs_inst *)scan_inst->prev) { 1459 if (scan_inst->dst.file == GRF && 1460 scan_inst->dst.reg == inst->src[0].reg) { 1461 /* Found the last thing to write our reg we want to turn 1462 * into a compute-to-MRF. 1463 */ 1464 1465 if (scan_inst->is_tex()) { 1466 /* texturing writes several continuous regs, so we can't 1467 * compute-to-mrf that. 1468 */ 1469 break; 1470 } 1471 1472 /* If it's predicated, it (probably) didn't populate all 1473 * the channels. We might be able to rewrite everything 1474 * that writes that reg, but it would require smarter 1475 * tracking to delay the rewriting until complete success. 1476 */ 1477 if (scan_inst->predicated) 1478 break; 1479 1480 /* If it's half of register setup and not the same half as 1481 * our MOV we're trying to remove, bail for now. 1482 */ 1483 if (scan_inst->force_uncompressed != inst->force_uncompressed || 1484 scan_inst->force_sechalf != inst->force_sechalf) { 1485 break; 1486 } 1487 1488 /* SEND instructions can't have MRF as a destination. */ 1489 if (scan_inst->mlen) 1490 break; 1491 1492 if (intel->gen >= 6) { 1493 /* gen6 math instructions must have the destination be 1494 * GRF, so no compute-to-MRF for them. 1495 */ 1496 if (scan_inst->is_math()) { 1497 break; 1498 } 1499 } 1500 1501 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1502 /* Found the creator of our MRF's source value. */ 1503 scan_inst->dst.file = MRF; 1504 scan_inst->dst.reg = inst->dst.reg; 1505 scan_inst->saturate |= inst->saturate; 1506 inst->remove(); 1507 progress = true; 1508 } 1509 break; 1510 } 1511 1512 /* We don't handle flow control here. Most computation of 1513 * values that end up in MRFs are shortly before the MRF 1514 * write anyway. 1515 */ 1516 if (scan_inst->opcode == BRW_OPCODE_DO || 1517 scan_inst->opcode == BRW_OPCODE_WHILE || 1518 scan_inst->opcode == BRW_OPCODE_ELSE || 1519 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1520 break; 1521 } 1522 1523 /* You can't read from an MRF, so if someone else reads our 1524 * MRF's source GRF that we wanted to rewrite, that stops us. 1525 */ 1526 bool interfered = false; 1527 for (int i = 0; i < 3; i++) { 1528 if (scan_inst->src[i].file == GRF && 1529 scan_inst->src[i].reg == inst->src[0].reg && 1530 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 1531 interfered = true; 1532 } 1533 } 1534 if (interfered) 1535 break; 1536 1537 if (scan_inst->dst.file == MRF) { 1538 /* If somebody else writes our MRF here, we can't 1539 * compute-to-MRF before that. 1540 */ 1541 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4; 1542 int scan_mrf_high; 1543 1544 if (scan_inst->dst.reg & BRW_MRF_COMPR4) { 1545 scan_mrf_high = scan_mrf_low + 4; 1546 } else if (c->dispatch_width == 16 && 1547 (!scan_inst->force_uncompressed && 1548 !scan_inst->force_sechalf)) { 1549 scan_mrf_high = scan_mrf_low + 1; 1550 } else { 1551 scan_mrf_high = scan_mrf_low; 1552 } 1553 1554 if (mrf_low == scan_mrf_low || 1555 mrf_low == scan_mrf_high || 1556 mrf_high == scan_mrf_low || 1557 mrf_high == scan_mrf_high) { 1558 break; 1559 } 1560 } 1561 1562 if (scan_inst->mlen > 0) { 1563 /* Found a SEND instruction, which means that there are 1564 * live values in MRFs from base_mrf to base_mrf + 1565 * scan_inst->mlen - 1. Don't go pushing our MRF write up 1566 * above it. 
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (c->dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicated) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   return progress;
}

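/* Worked example of the pass above (illustrative registers only): given
 *
 *    MOV m3, g7
 *    ADD g8, g7, g2
 *    MOV m3, g7
 *
 * within one basic block, with no intervening write to g7 or m3 and no
 * other fields differing, the second MOV equals() the recorded
 * last_mrf_move[3] and is removed.
 */
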
/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction generated
 * only reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->predicated ||
       end->force_uncompressed ||
       end->force_sechalf ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      /* align to 64 byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      foreach_list(node, &*shader->ir) {
         ir_instruction *ir = (ir_instruction *)node;
         base_ir = ir;
         this->result = reg_undef;
         ir->accept(this);
      }
      if (failed)
         return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      bool progress;
      do {
         progress = false;

         progress = remove_duplicate_mrf_writes() || progress;

         progress = propagate_constants() || progress;
         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
         progress = dead_code_eliminate() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything.
*/ 1761 int virtual_grf_count = virtual_grf_next; 1762 for (int i = 0; i < virtual_grf_count; i++) { 1763 spill_reg(i); 1764 } 1765 } 1766 1767 if (0) 1768 assign_regs_trivial(); 1769 else { 1770 while (!assign_regs()) { 1771 if (failed) 1772 break; 1773 } 1774 } 1775 } 1776 assert(force_uncompressed_stack == 0); 1777 assert(force_sechalf_stack == 0); 1778 1779 if (failed) 1780 return false; 1781 1782 generate_code(); 1783 1784 if (c->dispatch_width == 8) { 1785 c->prog_data.reg_blocks = brw_register_blocks(grf_used); 1786 } else { 1787 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); 1788 c->prog_data.prog_offset_16 = prog_offset_16; 1789 1790 /* Make sure we didn't try to sneak in an extra uniform */ 1791 assert(orig_nr_params == c->prog_data.nr_params); 1792 (void) orig_nr_params; 1793 } 1794 1795 return !failed; 1796} 1797 1798bool 1799brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, 1800 struct gl_shader_program *prog) 1801{ 1802 struct intel_context *intel = &brw->intel; 1803 1804 if (!prog) 1805 return false; 1806 1807 struct brw_shader *shader = 1808 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 1809 if (!shader) 1810 return false; 1811 1812 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1813 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 1814 _mesa_print_ir(shader->ir, NULL); 1815 printf("\n\n"); 1816 } 1817 1818 /* Now the main event: Visit the shader IR and generate our FS IR for it. 1819 */ 1820 c->dispatch_width = 8; 1821 1822 fs_visitor v(c, prog, shader); 1823 if (!v.run()) { 1824 prog->LinkStatus = false; 1825 ralloc_strcat(&prog->InfoLog, v.fail_msg); 1826 1827 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", 1828 v.fail_msg); 1829 1830 return false; 1831 } 1832 1833 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { 1834 c->dispatch_width = 16; 1835 fs_visitor v2(c, prog, shader); 1836 v2.import_uniforms(&v); 1837 v2.run(); 1838 } 1839 1840 c->prog_data.dispatch_width = 8; 1841 1842 return true; 1843} 1844 1845bool 1846brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) 1847{ 1848 struct brw_context *brw = brw_context(ctx); 1849 struct brw_wm_prog_key key; 1850 1851 /* As a temporary measure we assume that all programs use dFdy() (and hence 1852 * need to be compiled differently depending on whether we're rendering to 1853 * an FBO). FIXME: set this bool correctly based on the contents of the 1854 * program. 1855 */ 1856 bool program_uses_dfdy = true; 1857 1858 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT]) 1859 return true; 1860 1861 struct gl_fragment_program *fp = (struct gl_fragment_program *) 1862 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program; 1863 struct brw_fragment_program *bfp = brw_fragment_program(fp); 1864 1865 memset(&key, 0, sizeof(key)); 1866 1867 if (fp->UsesKill) 1868 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; 1869 1870 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 1871 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; 1872 1873 /* Just assume depth testing. 
*/ 1874 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; 1875 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; 1876 1877 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); 1878 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { 1879 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) 1880 continue; 1881 1882 key.proj_attrib_mask |= 1 << i; 1883 1884 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); 1885 1886 if (vp_index >= 0) 1887 key.vp_outputs_written |= BITFIELD64_BIT(vp_index); 1888 } 1889 1890 key.clamp_fragment_color = true; 1891 1892 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) { 1893 if (fp->Base.ShadowSamplers & (1 << i)) 1894 key.tex.compare_funcs[i] = GL_LESS; 1895 1896 /* FINISHME: depth compares might use (0,0,0,W) for example */ 1897 key.tex.swizzles[i] = SWIZZLE_XYZW; 1898 } 1899 1900 if (fp->Base.InputsRead & FRAG_BIT_WPOS) { 1901 key.drawable_height = ctx->DrawBuffer->Height; 1902 } 1903 1904 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) { 1905 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); 1906 } 1907 1908 key.nr_color_regions = 1; 1909 1910 key.program_string_id = bfp->id; 1911 1912 uint32_t old_prog_offset = brw->wm.prog_offset; 1913 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; 1914 1915 bool success = do_wm_prog(brw, prog, bfp, &key); 1916 1917 brw->wm.prog_offset = old_prog_offset; 1918 brw->wm.prog_data = old_prog_data; 1919 1920 return success; 1921} 1922