brw_fs.cpp revision 069901e2f5a8f4a58047d25335f2526f1acc7234
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs.cpp 25 * 26 * This file drives the GLSL IR -> LIR translation, contains the 27 * optimizations on the LIR, and drives the generation of native code 28 * from the LIR. 
29 */ 30 31extern "C" { 32 33#include <sys/types.h> 34 35#include "main/macros.h" 36#include "main/shaderobj.h" 37#include "main/uniforms.h" 38#include "program/prog_parameter.h" 39#include "program/prog_print.h" 40#include "program/register_allocate.h" 41#include "program/sampler.h" 42#include "program/hash_table.h" 43#include "brw_context.h" 44#include "brw_eu.h" 45#include "brw_wm.h" 46} 47#include "brw_shader.h" 48#include "brw_fs.h" 49#include "glsl/glsl_types.h" 50#include "glsl/ir_print_visitor.h" 51 52#define MAX_INSTRUCTION (1 << 30) 53 54int 55fs_visitor::type_size(const struct glsl_type *type) 56{ 57 unsigned int size, i; 58 59 switch (type->base_type) { 60 case GLSL_TYPE_UINT: 61 case GLSL_TYPE_INT: 62 case GLSL_TYPE_FLOAT: 63 case GLSL_TYPE_BOOL: 64 return type->components(); 65 case GLSL_TYPE_ARRAY: 66 return type_size(type->fields.array) * type->length; 67 case GLSL_TYPE_STRUCT: 68 size = 0; 69 for (i = 0; i < type->length; i++) { 70 size += type_size(type->fields.structure[i].type); 71 } 72 return size; 73 case GLSL_TYPE_SAMPLER: 74 /* Samplers take up no register space, since they're baked in at 75 * link time. 76 */ 77 return 0; 78 default: 79 assert(!"not reached"); 80 return 0; 81 } 82} 83 84void 85fs_visitor::fail(const char *format, ...) 
86{ 87 va_list va; 88 char *msg; 89 90 if (failed) 91 return; 92 93 failed = true; 94 95 va_start(va, format); 96 msg = ralloc_vasprintf(mem_ctx, format, va); 97 va_end(va); 98 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); 99 100 this->fail_msg = msg; 101 102 if (INTEL_DEBUG & DEBUG_WM) { 103 fprintf(stderr, "%s", msg); 104 } 105} 106 107void 108fs_visitor::push_force_uncompressed() 109{ 110 force_uncompressed_stack++; 111} 112 113void 114fs_visitor::pop_force_uncompressed() 115{ 116 force_uncompressed_stack--; 117 assert(force_uncompressed_stack >= 0); 118} 119 120void 121fs_visitor::push_force_sechalf() 122{ 123 force_sechalf_stack++; 124} 125 126void 127fs_visitor::pop_force_sechalf() 128{ 129 force_sechalf_stack--; 130 assert(force_sechalf_stack >= 0); 131} 132 133/** 134 * Returns how many MRFs an FS opcode will write over. 135 * 136 * Note that this is not the 0 or 1 implied writes in an actual gen 137 * instruction -- the FS opcodes often generate MOVs in addition. 
138 */ 139int 140fs_visitor::implied_mrf_writes(fs_inst *inst) 141{ 142 if (inst->mlen == 0) 143 return 0; 144 145 switch (inst->opcode) { 146 case SHADER_OPCODE_RCP: 147 case SHADER_OPCODE_RSQ: 148 case SHADER_OPCODE_SQRT: 149 case SHADER_OPCODE_EXP2: 150 case SHADER_OPCODE_LOG2: 151 case SHADER_OPCODE_SIN: 152 case SHADER_OPCODE_COS: 153 return 1 * c->dispatch_width / 8; 154 case SHADER_OPCODE_POW: 155 case SHADER_OPCODE_INT_QUOTIENT: 156 case SHADER_OPCODE_INT_REMAINDER: 157 return 2 * c->dispatch_width / 8; 158 case SHADER_OPCODE_TEX: 159 case FS_OPCODE_TXB: 160 case SHADER_OPCODE_TXD: 161 case SHADER_OPCODE_TXF: 162 case SHADER_OPCODE_TXL: 163 case SHADER_OPCODE_TXS: 164 return 1; 165 case FS_OPCODE_FB_WRITE: 166 return 2; 167 case FS_OPCODE_PULL_CONSTANT_LOAD: 168 case FS_OPCODE_UNSPILL: 169 return 1; 170 case FS_OPCODE_SPILL: 171 return 2; 172 default: 173 assert(!"not reached"); 174 return inst->mlen; 175 } 176} 177 178int 179fs_visitor::virtual_grf_alloc(int size) 180{ 181 if (virtual_grf_array_size <= virtual_grf_next) { 182 if (virtual_grf_array_size == 0) 183 virtual_grf_array_size = 16; 184 else 185 virtual_grf_array_size *= 2; 186 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 187 virtual_grf_array_size); 188 } 189 virtual_grf_sizes[virtual_grf_next] = size; 190 return virtual_grf_next++; 191} 192 193/** Fixed HW reg constructor. */ 194fs_reg::fs_reg(enum register_file file, int reg) 195{ 196 init(); 197 this->file = file; 198 this->reg = reg; 199 this->type = BRW_REGISTER_TYPE_F; 200} 201 202/** Fixed HW reg constructor. */ 203fs_reg::fs_reg(enum register_file file, int reg, uint32_t type) 204{ 205 init(); 206 this->file = file; 207 this->reg = reg; 208 this->type = type; 209} 210 211/** Automatic reg constructor. 
*/ 212fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 213{ 214 init(); 215 216 this->file = GRF; 217 this->reg = v->virtual_grf_alloc(v->type_size(type)); 218 this->reg_offset = 0; 219 this->type = brw_type_for_base_type(type); 220} 221 222fs_reg * 223fs_visitor::variable_storage(ir_variable *var) 224{ 225 return (fs_reg *)hash_table_find(this->variable_ht, var); 226} 227 228void 229import_uniforms_callback(const void *key, 230 void *data, 231 void *closure) 232{ 233 struct hash_table *dst_ht = (struct hash_table *)closure; 234 const fs_reg *reg = (const fs_reg *)data; 235 236 if (reg->file != UNIFORM) 237 return; 238 239 hash_table_insert(dst_ht, data, key); 240} 241 242/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 243 * This brings in those uniform definitions 244 */ 245void 246fs_visitor::import_uniforms(fs_visitor *v) 247{ 248 hash_table_call_foreach(v->variable_ht, 249 import_uniforms_callback, 250 variable_ht); 251 this->params_remap = v->params_remap; 252} 253 254/* Our support for uniforms is piggy-backed on the struct 255 * gl_fragment_program, because that's where the values actually 256 * get stored, rather than in some global gl_shader_program uniform 257 * store. 
 */
/**
 * Walks a uniform's type, appending one entry per scalar component to
 * c->prog_data (nr_params, param_convert[]) and recording the source
 * location/offset in param_index[]/param_offset[].
 *
 * Returns the number of uniform slots consumed, so callers can advance
 * 'loc' across aggregates.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      /* A matrix is handled as matrix_columns consecutive column vectors. */
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 assert(param < ARRAY_SIZE(c->prog_data.param));

	 if (ctx->Const.NativeIntegers) {
	    /* Native-integer drivers upload values as-is. */
	    c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	 } else {
	    /* Otherwise parameters are stored as floats and need a
	     * per-type conversion on upload.
	     */
	    switch (type->base_type) {
	    case GLSL_TYPE_FLOAT:
	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	       break;
	    case GLSL_TYPE_UINT:
	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
	       break;
	    case GLSL_TYPE_INT:
	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
	       break;
	    case GLSL_TYPE_BOOL:
	       c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
	       break;
	    default:
	       assert(!"not reached");
	       c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
	       break;
	    }
	 }
	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      /* A scalar/vector consumes exactly one uniform slot. */
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it.
       */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A repeated swizzle channel marks the end of the unique ones. */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param_convert[c->prog_data.nr_params] =
	    PARAM_NO_CONVERT;
	 this->param_index[c->prog_data.nr_params] = index;
	 this->param_offset[c->prog_data.nr_params] = swiz;
	 c->prog_data.nr_params++;
      }
   }
}

/**
 * Emits the instructions computing gl_FragCoord, one component per
 * consecutive reg_offset of the returned register (x, y, z, w).
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Flip Y when the window origin and the layout qualifier disagree
    * (accounting for FBO rendering being upside-down).
    */
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 /* y' = (height - 1) - y, done as -y + (height - 1 + center). */
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      /* Gen6+: source depth arrives directly in the thread payload. */
      emit(BRW_OPCODE_MOV, wpos,
	   fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      /* Pre-gen6: interpolate Z from the WPOS setup data. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

/**
 * Emits interpolation for a general varying input, walking arrays and
 * matrix columns and honoring flat/smooth/noperspective qualifiers.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
	       if (location >= FRAG_ATTRIB_TEX0 &&
		   location <= FRAG_ATTRIB_TEX7 &&
		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
		  /* Known-constant 1.0 q component of a texcoord. */
		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
	       } else {
		  struct brw_reg interp = interp_reg(location, k);
		  brw_wm_barycentric_interp_mode barycoord_mode;
		  if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
		     barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
		  else
		     barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
		  emit(FS_OPCODE_LINTERP, attr,
		       this->delta_x[barycoord_mode],
		       this->delta_y[barycoord_mode], fs_reg(interp));
	       }
	       attr.reg_offset++;
	    }

	    if (intel->gen < 6) {
	       /* Pre-gen6 setup data is interpolated as a/w, b/w, c/w;
		* multiply by pixel_w to undo the perspective divide.
		*/
	       attr.reg_offset -= type->vector_elements;
	       for (unsigned int k = 0; k < type->vector_elements; k++) {
		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  attr.reg_offset++;
	       }
	    }
	 }
	 location++;
      }
   }

   return reg;
}

/**
 * Emits code producing gl_FrontFacing as 0/1 in the returned register.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload.
    */
   if (intel->gen >= 6) {
      /* Replicate the sign bit of g0 with ASR, then invert and mask to 1. */
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
			   fs_reg(r1_6ud),
			   fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

/**
 * Emits a unary math instruction, applying the per-generation
 * workarounds (MRF setup pre-gen6, operand expansion on gen6).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      /* Pre-gen6 math is a send to the shared math unit via MRFs. */
      inst->base_mrf = 2;
      inst->mlen = c->dispatch_width / 8;
   }

   return inst;
}

/**
 * Emits a binary math instruction (POW / INT DIV), again applying the
 * per-generation workarounds.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src0.type;
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src1.type;
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand goes in the next MRF; the first rides along
       * with the instruction itself.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * c->dispatch_width / 8;
   }
   return inst;
}

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   /* Only the 8-wide compile builds the param list; 16-wide reuses it. */
   if (c->dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
	 this->param_offset[i];
   }
}

/**
 * Assigns push constants (CURBE) to fixed hardware registers after the
 * payload registers.
 */
void
fs_visitor::assign_curb_setup()
{
   /* Eight parameters (components) fit per CURBE register. */
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (c->dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
	 if (inst->src[i].file == UNIFORM) {
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
						  constant_nr / 8,
						  constant_nr % 8);

	    inst->src[i].file = FIXED_HW_REG;
	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
	 }
      }
   }
}

/**
 * Fills urb_setup[] with the slot each read FS input occupies in the
 * incoming setup data, and computes urb_read_length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
	    urb_setup[i] = urb_next++;
	 }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
	    int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

	    if (fp_index >= 0)
	       urb_setup[fp_index] = urb_next++;
	 }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

/**
 * Rewrites the interpolation instructions' setup-reg operands now that
 * the URB data's final register location is known.
 */
void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
	 assert(inst->src[2].file == FIXED_HW_REG);
	 inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
	 assert(inst->src[0].file == FIXED_HW_REG);
	 inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
758 * 759 * We want to split, because virtual GRFs are what we register 760 * allocate and spill (due to contiguousness requirements for some 761 * instructions), and they're what we naturally generate in the 762 * codegen process, but most virtual GRFs don't actually need to be 763 * contiguous sets of GRFs. If we split, we'll end up with reduced 764 * live intervals and better dead code elimination and coalescing. 765 */ 766void 767fs_visitor::split_virtual_grfs() 768{ 769 int num_vars = this->virtual_grf_next; 770 bool split_grf[num_vars]; 771 int new_virtual_grf[num_vars]; 772 773 /* Try to split anything > 0 sized. */ 774 for (int i = 0; i < num_vars; i++) { 775 if (this->virtual_grf_sizes[i] != 1) 776 split_grf[i] = true; 777 else 778 split_grf[i] = false; 779 } 780 781 if (brw->has_pln && 782 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) { 783 /* PLN opcodes rely on the delta_xy being contiguous. We only have to 784 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to 785 * Gen6, that was the only supported interpolation mode, and since Gen6, 786 * delta_x and delta_y are in fixed hardware registers. 787 */ 788 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] = 789 false; 790 } 791 792 foreach_list(node, &this->instructions) { 793 fs_inst *inst = (fs_inst *)node; 794 795 /* Texturing produces 4 contiguous registers, so no splitting. */ 796 if (inst->is_tex()) { 797 split_grf[inst->dst.reg] = false; 798 } 799 } 800 801 /* Allocate new space for split regs. Note that the virtual 802 * numbers will be contiguous. 
803 */ 804 for (int i = 0; i < num_vars; i++) { 805 if (split_grf[i]) { 806 new_virtual_grf[i] = virtual_grf_alloc(1); 807 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 808 int reg = virtual_grf_alloc(1); 809 assert(reg == new_virtual_grf[i] + j - 1); 810 (void) reg; 811 } 812 this->virtual_grf_sizes[i] = 1; 813 } 814 } 815 816 foreach_list(node, &this->instructions) { 817 fs_inst *inst = (fs_inst *)node; 818 819 if (inst->dst.file == GRF && 820 split_grf[inst->dst.reg] && 821 inst->dst.reg_offset != 0) { 822 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 823 inst->dst.reg_offset - 1); 824 inst->dst.reg_offset = 0; 825 } 826 for (int i = 0; i < 3; i++) { 827 if (inst->src[i].file == GRF && 828 split_grf[inst->src[i].reg] && 829 inst->src[i].reg_offset != 0) { 830 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 831 inst->src[i].reg_offset - 1); 832 inst->src[i].reg_offset = 0; 833 } 834 } 835 } 836 this->live_intervals_valid = false; 837} 838 839bool 840fs_visitor::remove_dead_constants() 841{ 842 if (c->dispatch_width == 8) { 843 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params); 844 845 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) 846 this->params_remap[i] = -1; 847 848 /* Find which params are still in use. */ 849 foreach_list(node, &this->instructions) { 850 fs_inst *inst = (fs_inst *)node; 851 852 for (int i = 0; i < 3; i++) { 853 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; 854 855 if (inst->src[i].file != UNIFORM) 856 continue; 857 858 assert(constant_nr < (int)c->prog_data.nr_params); 859 860 /* For now, set this to non-negative. We'll give it the 861 * actual new number in a moment, in order to keep the 862 * register numbers nicely ordered. 863 */ 864 this->params_remap[constant_nr] = 0; 865 } 866 } 867 868 /* Figure out what the new numbers for the params will be. 
At some 869 * point when we're doing uniform array access, we're going to want 870 * to keep the distinction between .reg and .reg_offset, but for 871 * now we don't care. 872 */ 873 unsigned int new_nr_params = 0; 874 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 875 if (this->params_remap[i] != -1) { 876 this->params_remap[i] = new_nr_params++; 877 } 878 } 879 880 /* Update the list of params to be uploaded to match our new numbering. */ 881 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 882 int remapped = this->params_remap[i]; 883 884 if (remapped == -1) 885 continue; 886 887 /* We've already done setup_paramvalues_refs() so no need to worry 888 * about param_index and param_offset. 889 */ 890 c->prog_data.param[remapped] = c->prog_data.param[i]; 891 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i]; 892 } 893 894 c->prog_data.nr_params = new_nr_params; 895 } else { 896 /* This should have been generated in the 8-wide pass already. */ 897 assert(this->params_remap); 898 } 899 900 /* Now do the renumbering of the shader to remove unused params. */ 901 foreach_list(node, &this->instructions) { 902 fs_inst *inst = (fs_inst *)node; 903 904 for (int i = 0; i < 3; i++) { 905 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; 906 907 if (inst->src[i].file != UNIFORM) 908 continue; 909 910 assert(this->params_remap[constant_nr] != -1); 911 inst->src[i].reg = this->params_remap[constant_nr]; 912 inst->src[i].reg_offset = 0; 913 } 914 } 915 916 return true; 917} 918 919/** 920 * Choose accesses from the UNIFORM file to demote to using the pull 921 * constant buffer. 922 * 923 * We allow a fragment shader to have more than the specified minimum 924 * maximum number of fragment shader uniform components (64). If 925 * there are too many of these, they'd fill up all of register space. 926 * So, this will push some of them out to the pull constant buffer and 927 * update the program to load them. 
928 */ 929void 930fs_visitor::setup_pull_constants() 931{ 932 /* Only allow 16 registers (128 uniform components) as push constants. */ 933 unsigned int max_uniform_components = 16 * 8; 934 if (c->prog_data.nr_params <= max_uniform_components) 935 return; 936 937 if (c->dispatch_width == 16) { 938 fail("Pull constants not supported in 16-wide\n"); 939 return; 940 } 941 942 /* Just demote the end of the list. We could probably do better 943 * here, demoting things that are rarely used in the program first. 944 */ 945 int pull_uniform_base = max_uniform_components; 946 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 947 948 foreach_list(node, &this->instructions) { 949 fs_inst *inst = (fs_inst *)node; 950 951 for (int i = 0; i < 3; i++) { 952 if (inst->src[i].file != UNIFORM) 953 continue; 954 955 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset; 956 if (uniform_nr < pull_uniform_base) 957 continue; 958 959 fs_reg dst = fs_reg(this, glsl_type::float_type); 960 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 961 dst); 962 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 963 pull->ir = inst->ir; 964 pull->annotation = inst->annotation; 965 pull->base_mrf = 14; 966 pull->mlen = 1; 967 968 inst->insert_before(pull); 969 970 inst->src[i].file = GRF; 971 inst->src[i].reg = dst.reg; 972 inst->src[i].reg_offset = 0; 973 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 974 } 975 } 976 977 for (int i = 0; i < pull_uniform_count; i++) { 978 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 979 c->prog_data.pull_param_convert[i] = 980 c->prog_data.param_convert[pull_uniform_base + i]; 981 } 982 c->prog_data.nr_params -= pull_uniform_count; 983 c->prog_data.nr_pull_params = pull_uniform_count; 984} 985 986void 987fs_visitor::calculate_live_intervals() 988{ 989 int num_vars = this->virtual_grf_next; 990 int *def = ralloc_array(mem_ctx, int, num_vars); 991 int *use = ralloc_array(mem_ctx, 
int, num_vars); 992 int loop_depth = 0; 993 int loop_start = 0; 994 995 if (this->live_intervals_valid) 996 return; 997 998 for (int i = 0; i < num_vars; i++) { 999 def[i] = MAX_INSTRUCTION; 1000 use[i] = -1; 1001 } 1002 1003 int ip = 0; 1004 foreach_list(node, &this->instructions) { 1005 fs_inst *inst = (fs_inst *)node; 1006 1007 if (inst->opcode == BRW_OPCODE_DO) { 1008 if (loop_depth++ == 0) 1009 loop_start = ip; 1010 } else if (inst->opcode == BRW_OPCODE_WHILE) { 1011 loop_depth--; 1012 1013 if (loop_depth == 0) { 1014 /* Patches up the use of vars marked for being live across 1015 * the whole loop. 1016 */ 1017 for (int i = 0; i < num_vars; i++) { 1018 if (use[i] == loop_start) { 1019 use[i] = ip; 1020 } 1021 } 1022 } 1023 } else { 1024 for (unsigned int i = 0; i < 3; i++) { 1025 if (inst->src[i].file == GRF) { 1026 int reg = inst->src[i].reg; 1027 1028 if (!loop_depth) { 1029 use[reg] = ip; 1030 } else { 1031 def[reg] = MIN2(loop_start, def[reg]); 1032 use[reg] = loop_start; 1033 1034 /* Nobody else is going to go smash our start to 1035 * later in the loop now, because def[reg] now 1036 * points before the bb header. 1037 */ 1038 } 1039 } 1040 } 1041 if (inst->dst.file == GRF) { 1042 int reg = inst->dst.reg; 1043 1044 if (!loop_depth) { 1045 def[reg] = MIN2(def[reg], ip); 1046 } else { 1047 def[reg] = MIN2(def[reg], loop_start); 1048 } 1049 } 1050 } 1051 1052 ip++; 1053 } 1054 1055 ralloc_free(this->virtual_grf_def); 1056 ralloc_free(this->virtual_grf_use); 1057 this->virtual_grf_def = def; 1058 this->virtual_grf_use = use; 1059 1060 this->live_intervals_valid = true; 1061} 1062 1063/** 1064 * Attempts to move immediate constants into the immediate 1065 * constant slot of following instructions. 
1066 *
1067 * Immediate constants are a bit tricky -- they have to be in the last
1068 * operand slot, you can't do abs/negate on them,
1069 */
1070
1071 bool
1072 fs_visitor::propagate_constants()
1073 {
1074 bool progress = false;
1075
1076 calculate_live_intervals();
1077
1078 foreach_list(node, &this->instructions) {
1079 fs_inst *inst = (fs_inst *)node;
1080
1081 if (inst->opcode != BRW_OPCODE_MOV ||
1082 inst->predicated ||
1083 inst->dst.file != GRF || inst->src[0].file != IMM ||
1084 inst->dst.type != inst->src[0].type ||
1085 (c->dispatch_width == 16 &&
1086 (inst->force_uncompressed || inst->force_sechalf)))
1087 continue;
1088
1089 /* Don't bother with cases where we should have had the
1090 * operation on the constant folded in GLSL already.
1091 */
1092 if (inst->saturate)
1093 continue;
1094
1095 /* Found a move of a constant to a GRF. Find anything else using the GRF
1096 * before it's written, and replace it with the constant if we can.
1097 */
1098 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1099 !scan_inst->is_tail_sentinel();
1100 scan_inst = (fs_inst *)scan_inst->next) {
/* Propagation is restricted to the basic block: any control-flow
 * instruction ends the forward scan.
 */
1101 if (scan_inst->opcode == BRW_OPCODE_DO ||
1102 scan_inst->opcode == BRW_OPCODE_WHILE ||
1103 scan_inst->opcode == BRW_OPCODE_ELSE ||
1104 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1105 break;
1106 }
1107
/* Check each source of this instruction for a read of the MOV's
 * destination register.
 */
1108 for (int i = 2; i >= 0; i--) {
1109 if (scan_inst->src[i].file != GRF ||
1110 scan_inst->src[i].reg != inst->dst.reg ||
1111 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
1112 continue;
1113
1114 /* Don't bother with cases where we should have had the
1115 * operation on the constant folded in GLSL already.
1116 */
1117 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
1118 continue;
1119
1120 switch (scan_inst->opcode) {
1121 case BRW_OPCODE_MOV:
1122 scan_inst->src[i] = inst->src[0];
1123 progress = true;
1124 break;
1125
1126 case BRW_OPCODE_MUL:
1127 case BRW_OPCODE_ADD:
1128 if (i == 1) {
1129 scan_inst->src[i] = inst->src[0];
1130 progress = true;
1131 } else if (i == 0 && scan_inst->src[1].file != IMM) {
1132 /* Fit this constant in by commuting the operands.
1133 * Exception: we can't do this for 32-bit integer MUL
1134 * because it's asymmetric.
1135 */
1136 if (scan_inst->opcode == BRW_OPCODE_MUL &&
1137 (scan_inst->src[1].type == BRW_REGISTER_TYPE_D ||
1138 scan_inst->src[1].type == BRW_REGISTER_TYPE_UD))
1139 break;
1140 scan_inst->src[0] = scan_inst->src[1];
1141 scan_inst->src[1] = inst->src[0];
1142 progress = true;
1143 }
1144 break;
1145
1146 case BRW_OPCODE_CMP:
1147 case BRW_OPCODE_IF:
1148 if (i == 1) {
1149 scan_inst->src[i] = inst->src[0];
1150 progress = true;
1151 } else if (i == 0 && scan_inst->src[1].file != IMM) {
1152 uint32_t new_cmod;
1153
1154 new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
/* brw_swap_cmod() returns ~0u when the conditional mod has no
 * mirrored equivalent, in which case we can't commute.
 */
1155 if (new_cmod != ~0u) {
1156 /* Fit this constant in by swapping the operands and
1157 * flipping the test
1158 */
1159 scan_inst->src[0] = scan_inst->src[1];
1160 scan_inst->src[1] = inst->src[0];
1161 scan_inst->conditional_mod = new_cmod;
1162 progress = true;
1163 }
1164 }
1165 break;
1166
1167 case BRW_OPCODE_SEL:
1168 if (i == 1) {
1169 scan_inst->src[i] = inst->src[0];
1170 progress = true;
1171 } else if (i == 0 && scan_inst->src[1].file != IMM) {
1172 scan_inst->src[0] = scan_inst->src[1];
1173 scan_inst->src[1] = inst->src[0];
1174
1175 /* If this was predicated, flipping operands means
1176 * we also need to flip the predicate.
1177 */
1178 if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) {
1179 scan_inst->predicate_inverse =
1180 !scan_inst->predicate_inverse;
1181 }
1182 progress = true;
1183 }
1184 break;
1185
1186 case SHADER_OPCODE_RCP:
1187 /* The hardware doesn't do math on immediate values
1188 * (because why are you doing that, seriously?), but
1189 * the correct answer is to just constant fold it
1190 * anyway.
1191 */
1192 assert(i == 0);
1193 if (inst->src[0].imm.f != 0.0f) {
1194 scan_inst->opcode = BRW_OPCODE_MOV;
1195 scan_inst->src[0] = inst->src[0];
1196 scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f;
1197 progress = true;
1198 }
1199 break;
1200
1201 default:
1202 break;
1203 }
1204 }
1205
/* If our MOV's destination gets overwritten, later reads see the new
 * value rather than the constant, so stop the scan. Texturing writes
 * several contiguous registers, so any reg_offset of an is_tex()
 * destination counts as a clobber.
 */
1206 if (scan_inst->dst.file == GRF &&
1207 scan_inst->dst.reg == inst->dst.reg &&
1208 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
1209 scan_inst->is_tex())) {
1210 break;
1211 }
1212 }
1213 }
1214
1215 if (progress)
1216 this->live_intervals_valid = false;
1217
1218 return progress;
1219 }
1220
1221
1222 /**
1223 * Applies small local algebraic simplifications to instructions,
1224 * replacing them with cheaper equivalents.
1225 *
1226 * Currently this only handles multiplication by a 1.0 float immediate,
1227 * which is rewritten into a plain MOV of the other operand.
1228 */
1229
1230 bool
1231 fs_visitor::opt_algebraic()
1232 {
1233 bool progress = false;
1234
1235 calculate_live_intervals();
1236
1237 foreach_list(node, &this->instructions) {
1238 fs_inst *inst = (fs_inst *)node;
1239
1240 switch (inst->opcode) {
1241 case BRW_OPCODE_MUL:
1242 if (inst->src[1].file != IMM)
1243 continue;
1244
1245 /* a * 1.0 = a */
1246 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1247 inst->src[1].imm.f == 1.0) {
1248 inst->opcode = BRW_OPCODE_MOV;
1249 inst->src[1] = reg_undef;
1250 progress = true;
1251 break;
1252 }
1253
1254 break;
1255 default:
1256 break;
1257 }
1258 }
1259
1260 return progress;
1261 }
1262
1263 /**
1264 * Must be called after calculate_live_intervals() to remove unused
1265 * writes to registers -- register allocation will fail otherwise
1266 * because something defined but never used won't be considered to
1267 * interfere with other regs.
1268 */
1269 bool
1270 fs_visitor::dead_code_eliminate()
1271 {
1272 bool progress = false;
1273 int pc = 0;
1274
1275 calculate_live_intervals();
1276
1277 foreach_list_safe(node, &this->instructions) {
1278 fs_inst *inst = (fs_inst *)node;
1279
/* virtual_grf_use[] records the IP of the last read of each GRF; a
 * write at or after that point can never be read, so the instruction
 * is dead.
 */
1280 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1281 inst->remove();
1282 progress = true;
1283 }
1284
1285 pc++;
1286 }
1287
1288 if (progress)
1289 live_intervals_valid = false;
1290
1291 return progress;
1292 }
1293
/**
 * Eliminates GRF-to-GRF (or UNIFORM-to-GRF) MOVs by rewriting later
 * readers of the MOV's destination to read the MOV's source directly,
 * then removing the MOV.
 */
1294 bool
1295 fs_visitor::register_coalesce()
1296 {
1297 bool progress = false;
1298 int if_depth = 0;
1299 int loop_depth = 0;
1300
1301 foreach_list_safe(node, &this->instructions) {
1302 fs_inst *inst = (fs_inst *)node;
1303
1304 /* Make sure that we dominate the instructions we're going to
1305 * scan for interfering with our coalescing, or we won't have
1306 * scanned enough to see if anything interferes with our
1307 * coalescing. We don't dominate the following instructions if
1308 * we're in a loop or an if block.
1309 */
1310 switch (inst->opcode) {
1311 case BRW_OPCODE_DO:
1312 loop_depth++;
1313 break;
1314 case BRW_OPCODE_WHILE:
1315 loop_depth--;
1316 break;
1317 case BRW_OPCODE_IF:
1318 if_depth++;
1319 break;
1320 case BRW_OPCODE_ENDIF:
1321 if_depth--;
1322 break;
1323 default:
1324 break;
1325 }
1326 if (loop_depth || if_depth)
1327 continue;
1328
1329 if (inst->opcode != BRW_OPCODE_MOV ||
1330 inst->predicated ||
1331 inst->saturate ||
1332 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1333 inst->src[0].file != UNIFORM)||
1334 inst->dst.type != inst->src[0].type)
1335 continue;
1336
1337 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
1338
1339 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1340 * them: check for no writes to either one until the exit of the
1341 * program.
1342 */
1343 bool interfered = false;
1344
1345 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1346 !scan_inst->is_tail_sentinel();
1347 scan_inst = (fs_inst *)scan_inst->next) {
1348 if (scan_inst->dst.file == GRF) {
1349 if (scan_inst->dst.reg == inst->dst.reg &&
1350 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
1351 scan_inst->is_tex())) {
1352 interfered = true;
1353 break;
1354 }
1355 if (inst->src[0].file == GRF &&
1356 scan_inst->dst.reg == inst->src[0].reg &&
1357 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
1358 scan_inst->is_tex())) {
1359 interfered = true;
1360 break;
1361 }
1362 }
1363
1364 /* The gen6 MATH instruction can't handle source modifiers or
1365 * unusual register regions, so avoid coalescing those for
1366 * now. We should do something more specific.
1367 */
1368 if (intel->gen >= 6 &&
1369 scan_inst->is_math() &&
1370 (has_source_modifiers || inst->src[0].file == UNIFORM)) {
1371 interfered = true;
1372 break;
1373 }
1374
1375 /* The accumulator result appears to get used for the
1376 * conditional modifier generation. When negating a UD
1377 * value, there is a 33rd bit generated for the sign in the
1378 * accumulator value, so now you can't check, for example,
1379 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1380 */
1381 if (scan_inst->conditional_mod &&
1382 inst->src[0].negate &&
1383 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1384 interfered = true;
1385 break;
1386 }
1387 }
1388 if (interfered) {
1389 continue;
1390 }
1391
1392 /* Rewrite the later usage to point at the source of the move to
1393 * be removed.
1394 */
1395 for (fs_inst *scan_inst = inst;
1396 !scan_inst->is_tail_sentinel();
1397 scan_inst = (fs_inst *)scan_inst->next) {
1398 for (int i = 0; i < 3; i++) {
1399 if (scan_inst->src[i].file == GRF &&
1400 scan_inst->src[i].reg == inst->dst.reg &&
1401 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1402 fs_reg new_src = inst->src[0];
/* Compose source modifiers: abs on the use discards any negate on
 * the MOV's source, and negates accumulate by XOR.
 */
1403 if (scan_inst->src[i].abs) {
1404 new_src.negate = 0;
1405 new_src.abs = 1;
1406 }
1407 new_src.negate ^= scan_inst->src[i].negate;
1408 scan_inst->src[i] = new_src;
1409 }
1410 }
1411 }
1412
1413 inst->remove();
1414 progress = true;
1415 }
1416
1417 if (progress)
1418 live_intervals_valid = false;
1419
1420 return progress;
1421 }
1422
1423
/**
 * Attempts to rewrite the instruction that computed a GRF value so that
 * it writes directly into the MRF, eliminating the GRF-to-MRF MOV.
 */
1424 bool
1425 fs_visitor::compute_to_mrf()
1426 {
1427 bool progress = false;
1428 int next_ip = 0;
1429
1430 calculate_live_intervals();
1431
1432 foreach_list_safe(node, &this->instructions) {
1433 fs_inst *inst = (fs_inst *)node;
1434
1435 int ip = next_ip;
1436 next_ip++;
1437
1438 if (inst->opcode != BRW_OPCODE_MOV ||
1439 inst->predicated ||
1440 inst->dst.file != MRF || inst->src[0].file != GRF ||
1441 inst->dst.type != inst->src[0].type ||
1442 inst->src[0].abs || inst->src[0].negate ||
inst->src[0].smear != -1)
1443 continue;
1444
1445 /* Work out which hardware MRF registers are written by this
1446 * instruction.
1447 */
/* NOTE(review): a COMPR4 write is treated as touching mrf_low and
 * mrf_low + 4, while a full 16-wide write touches mrf_low and
 * mrf_low + 1; otherwise only mrf_low is written.
 */
1448 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1449 int mrf_high;
1450 if (inst->dst.reg & BRW_MRF_COMPR4) {
1451 mrf_high = mrf_low + 4;
1452 } else if (c->dispatch_width == 16 &&
1453 (!inst->force_uncompressed && !inst->force_sechalf)) {
1454 mrf_high = mrf_low + 1;
1455 } else {
1456 mrf_high = mrf_low;
1457 }
1458
1459 /* Can't compute-to-MRF this GRF if someone else was going to
1460 * read it later.
1461 */
1462 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1463 continue;
1464
1465 /* Found a move of a GRF to a MRF. Let's see if we can go
1466 * rewrite the thing that made this GRF to write into the MRF.
1467 */
1468 fs_inst *scan_inst;
1469 for (scan_inst = (fs_inst *)inst->prev;
1470 scan_inst->prev != NULL;
1471 scan_inst = (fs_inst *)scan_inst->prev) {
1472 if (scan_inst->dst.file == GRF &&
1473 scan_inst->dst.reg == inst->src[0].reg) {
1474 /* Found the last thing to write our reg we want to turn
1475 * into a compute-to-MRF.
1476 */
1477
1478 if (scan_inst->is_tex()) {
1479 /* texturing writes several continuous regs, so we can't
1480 * compute-to-mrf that.
1481 */
1482 break;
1483 }
1484
1485 /* If it's predicated, it (probably) didn't populate all
1486 * the channels. We might be able to rewrite everything
1487 * that writes that reg, but it would require smarter
1488 * tracking to delay the rewriting until complete success.
1489 */
1490 if (scan_inst->predicated)
1491 break;
1492
1493 /* If it's half of register setup and not the same half as
1494 * our MOV we're trying to remove, bail for now.
1495 */
1496 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1497 scan_inst->force_sechalf != inst->force_sechalf) {
1498 break;
1499 }
1500
1501 /* SEND instructions can't have MRF as a destination. */
1502 if (scan_inst->mlen)
1503 break;
1504
1505 if (intel->gen >= 6) {
1506 /* gen6 math instructions must have the destination be
1507 * GRF, so no compute-to-MRF for them.
1508 */
1509 if (scan_inst->is_math()) {
1510 break;
1511 }
1512 }
1513
1514 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1515 /* Found the creator of our MRF's source value. */
1516 scan_inst->dst.file = MRF;
1517 scan_inst->dst.reg = inst->dst.reg;
1518 scan_inst->saturate |= inst->saturate;
1519 inst->remove();
1520 progress = true;
1521 }
1522 break;
1523 }
1524
1525 /* We don't handle flow control here. Most computation of
1526 * values that end up in MRFs are shortly before the MRF
1527 * write anyway.
1528 */
1529 if (scan_inst->opcode == BRW_OPCODE_DO ||
1530 scan_inst->opcode == BRW_OPCODE_WHILE ||
1531 scan_inst->opcode == BRW_OPCODE_ELSE ||
1532 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1533 break;
1534 }
1535
1536 /* You can't read from an MRF, so if someone else reads our
1537 * MRF's source GRF that we wanted to rewrite, that stops us.
1538 */
1539 bool interfered = false;
1540 for (int i = 0; i < 3; i++) {
1541 if (scan_inst->src[i].file == GRF &&
1542 scan_inst->src[i].reg == inst->src[0].reg &&
1543 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1544 interfered = true;
1545 }
1546 }
1547 if (interfered)
1548 break;
1549
1550 if (scan_inst->dst.file == MRF) {
1551 /* If somebody else writes our MRF here, we can't
1552 * compute-to-MRF before that.
1553 */
/* Same low/high range computation as for our own MOV above; the two
 * ranges interfere if they share any MRF register.
 */
1554 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1555 int scan_mrf_high;
1556
1557 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1558 scan_mrf_high = scan_mrf_low + 4;
1559 } else if (c->dispatch_width == 16 &&
1560 (!scan_inst->force_uncompressed &&
1561 !scan_inst->force_sechalf)) {
1562 scan_mrf_high = scan_mrf_low + 1;
1563 } else {
1564 scan_mrf_high = scan_mrf_low;
1565 }
1566
1567 if (mrf_low == scan_mrf_low ||
1568 mrf_low == scan_mrf_high ||
1569 mrf_high == scan_mrf_low ||
1570 mrf_high == scan_mrf_high) {
1571 break;
1572 }
1573 }
1574
1575 if (scan_inst->mlen > 0) {
1576 /* Found a SEND instruction, which means that there are
1577 * live values in MRFs from base_mrf to base_mrf +
1578 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1579 * above it.
1580 */
1581 if (mrf_low >= scan_inst->base_mrf &&
1582 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1583 break;
1584 }
1585 if (mrf_high >= scan_inst->base_mrf &&
1586 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1587 break;
1588 }
1589 }
1590 }
1591 }
1592
1593 return progress;
1594 }
1595
1596 /**
1597 * Walks through basic blocks, looking for repeated MRF writes and
1598 * removing the later ones.
1599 */
1600 bool
1601 fs_visitor::remove_duplicate_mrf_writes()
1602 {
1603 fs_inst *last_mrf_move[16];
1604 bool progress = false;
1605
1606 /* Need to update the MRF tracking for compressed instructions. */
*/
1607 if (c->dispatch_width == 16)
1608 return false;
1609
1610 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1611
1612 foreach_list_safe(node, &this->instructions) {
1613 fs_inst *inst = (fs_inst *)node;
1614
/* This pass only operates within a basic block, so forget all tracked
 * MRF writes at any control-flow instruction.
 */
1615 switch (inst->opcode) {
1616 case BRW_OPCODE_DO:
1617 case BRW_OPCODE_WHILE:
1618 case BRW_OPCODE_IF:
1619 case BRW_OPCODE_ELSE:
1620 case BRW_OPCODE_ENDIF:
1621 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1622 continue;
1623 default:
1624 break;
1625 }
1626
/* If this MOV writes exactly what the MRF already contains, drop it. */
1627 if (inst->opcode == BRW_OPCODE_MOV &&
1628 inst->dst.file == MRF) {
1629 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1630 if (prev_inst && inst->equals(prev_inst)) {
1631 inst->remove();
1632 progress = true;
1633 continue;
1634 }
1635 }
1636
1637 /* Clear out the last-write records for MRFs that were overwritten. */
1638 if (inst->dst.file == MRF) {
1639 last_mrf_move[inst->dst.reg] = NULL;
1640 }
1641
1642 if (inst->mlen > 0) {
1643 /* Found a SEND instruction, which will include two or fewer
1644 * implied MRF writes. We could do better here.
1645 */
1646 for (int i = 0; i < implied_mrf_writes(inst); i++) {
1647 last_mrf_move[inst->base_mrf + i] = NULL;
1648 }
1649 }
1650
1651 /* Clear out any MRF move records whose sources got overwritten. */
1652 if (inst->dst.file == GRF) {
1653 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1654 if (last_mrf_move[i] &&
1655 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1656 last_mrf_move[i] = NULL;
1657 }
1658 }
1659 }
1660
/* Record this unpredicated GRF-to-MRF MOV as the last known write of
 * its MRF, so an identical later MOV can be removed.
 */
1661 if (inst->opcode == BRW_OPCODE_MOV &&
1662 inst->dst.file == MRF &&
1663 inst->src[0].file == GRF &&
1664 !inst->predicated) {
1665 last_mrf_move[inst->dst.reg] = inst;
1666 }
1667 }
1668
1669 return progress;
1670 }
1671
/**
 * Returns whether the live intervals of virtual GRFs a and b overlap,
 * i.e. whether they may not share a hardware register.
 */
1672 bool
1673 fs_visitor::virtual_grf_interferes(int a, int b)
1674 {
/* Two [def, use] intervals interfere when the later def starts before
 * the earlier use ends.
 */
1675 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
1676 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
1677
1678 /* We can't handle dead register writes here, without iterating
1679 * over the whole instruction stream to find every single dead
1680 * write to that register to compare to the live interval of the
1681 * other register. Just assert that dead_code_eliminate() has been
1682 * called.
1683 */
1684 assert((this->virtual_grf_use[a] != -1 ||
1685 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
1686 (this->virtual_grf_use[b] != -1 ||
1687 this->virtual_grf_def[b] == MAX_INSTRUCTION));
1688
1689 /* If the register is used to store 16 values of less than float
1690 * size (only the case for pixel_[xy]), then we can't allocate
1691 * another dword-sized thing to that register that would be used in
1692 * the same instruction. This is because when the GPU decodes (for
1693 * example):
1694 *
1695 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
1696 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
1697 *
1698 * it's actually processed as:
1699 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
1700 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
1701 *
1702 * so our second half values in g6 got overwritten in the first
1703 * half.
1704 */
1705 if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
1706 this->pixel_x.reg == b ||
1707 this->pixel_y.reg == a ||
1708 this->pixel_y.reg == b)) {
1709 return start <= end;
1710 }
1711
1712 return start < end;
1713 }
1714
/**
 * Main compilation entry point for one dispatch width: builds the FS IR
 * from the GLSL IR, runs the optimization loop, allocates registers and
 * generates native code. Returns false on failure (see fail_msg).
 */
1715 bool
1716 fs_visitor::run()
1717 {
1718 uint32_t prog_offset_16 = 0;
1719 uint32_t orig_nr_params = c->prog_data.nr_params;
1720
1721 brw_wm_payload_setup(brw, c);
1722
1723 if (c->dispatch_width == 16) {
1724 /* align to 64 byte boundary. */
1725 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
1726 brw_NOP(p);
1727 }
1728
1729 /* Save off the start of this 16-wide program in case we succeed. */
1730 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
1731
1732 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1733 }
1734
1735 if (0) {
1736 emit_dummy_fs();
1737 } else {
1738 calculate_urb_setup();
1739 if (intel->gen < 6)
1740 emit_interpolation_setup_gen4();
1741 else
1742 emit_interpolation_setup_gen6();
1743
1744 /* Generate FS IR for main(). (the visitor only descends into
1745 * functions called "main").
1746 */
1747 foreach_list(node, &*shader->ir) {
1748 ir_instruction *ir = (ir_instruction *)node;
1749 base_ir = ir;
1750 this->result = reg_undef;
1751 ir->accept(this);
1752 }
1753 if (failed)
1754 return false;
1755
1756 emit_fb_writes();
1757
1758 split_virtual_grfs();
1759
1760 setup_paramvalues_refs();
1761 setup_pull_constants();
1762
/* Run the optimization passes to a fixed point. */
1763 bool progress;
1764 do {
1765 progress = false;
1766
1767 progress = remove_duplicate_mrf_writes() || progress;
1768
1769 progress = propagate_constants() || progress;
1770 progress = opt_algebraic() || progress;
1771 progress = register_coalesce() || progress;
1772 progress = compute_to_mrf() || progress;
1773 progress = dead_code_eliminate() || progress;
1774 } while (progress);
1775
1776 remove_dead_constants();
1777
1778 schedule_instructions();
1779
1780 assign_curb_setup();
1781 assign_urb_setup();
1782
1783 if (0) {
1784 /* Debug of register spilling: Go spill everything. */
1785 int virtual_grf_count = virtual_grf_next;
1786 for (int i = 0; i < virtual_grf_count; i++) {
1787 spill_reg(i);
1788 }
1789 }
1790
1791 if (0)
1792 assign_regs_trivial();
1793 else {
/* assign_regs() may fail and spill; retry until it succeeds or the
 * compile is marked failed.
 */
1794 while (!assign_regs()) {
1795 if (failed)
1796 break;
1797 }
1798 }
1799 }
1800 assert(force_uncompressed_stack == 0);
1801 assert(force_sechalf_stack == 0);
1802
1803 if (failed)
1804 return false;
1805
1806 generate_code();
1807
1808 if (c->dispatch_width == 8) {
1809 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
1810 } else {
1811 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
1812 c->prog_data.prog_offset_16 = prog_offset_16;
1813
1814 /* Make sure we didn't try to sneak in an extra uniform */
1815 assert(orig_nr_params == c->prog_data.nr_params);
1816 (void) orig_nr_params;
1817 }
1818
1819 return !failed;
1820 }
1821
/**
 * Compiles the linked fragment shader: always emits an 8-wide program,
 * and additionally a 16-wide program on gen5+ when no pull constants
 * are in use. Returns false (and sets the program's InfoLog) when the
 * 8-wide compile fails.
 */
1822 bool
1823 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
1824 struct gl_shader_program *prog)
1825 {
1826 struct intel_context *intel = &brw->intel;
1827
1828 if (!prog)
1829 return false;
1830
1831 struct brw_shader *shader =
1832 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
1833 if (!shader)
1834 return false;
1835
1836 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1837 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
1838 _mesa_print_ir(shader->ir, NULL);
1839 printf("\n\n");
1840 }
1841
1842 /* Now the main event: Visit the shader IR and generate our FS IR for it.
1843 */
1844 c->dispatch_width = 8;
1845
1846 fs_visitor v(c, prog, shader);
1847 if (!v.run()) {
1848 prog->LinkStatus = false;
1849 ralloc_strcat(&prog->InfoLog, v.fail_msg);
1850
1851 return false;
1852 }
1853
1854 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
1855 c->dispatch_width = 16;
1856 fs_visitor v2(c, prog, shader);
1857 v2.import_uniforms(&v);
/* A 16-wide failure is not fatal: the result is ignored and the
 * already-compiled 8-wide program is used on its own.
 */
1858 v2.run();
1859 }
1860
1861 c->prog_data.dispatch_width = 8;
1862
1863 return true;
1864 }
1865
/**
 * Precompiles the fragment program at link time with a best-guess
 * program key (e.g. depth test/write assumed enabled), so that a likely
 * variant is ready before the first draw. The previous program binding
 * is restored afterwards.
 */
1866 bool
1867 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
1868 {
1869 struct brw_context *brw = brw_context(ctx);
1870 struct brw_wm_prog_key key;
1871
1872 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
1873 return true;
1874
1875 struct gl_fragment_program *fp = (struct gl_fragment_program *)
1876 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
1877 struct brw_fragment_program *bfp = brw_fragment_program(fp);
1878
1879 memset(&key, 0, sizeof(key));
1880
1881 if (fp->UsesKill)
1882 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
1883
1884 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
1885 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
1886
1887 /* Just assume depth testing. */
1888 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
1889 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
1890
1891 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
1892 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1893 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
1894 continue;
1895
1896 key.proj_attrib_mask |= 1 << i;
1897
1898 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1899
1900 if (vp_index >= 0)
1901 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
1902 }
1903
1904 key.clamp_fragment_color = true;
1905
1906 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) {
1907 if (fp->Base.ShadowSamplers & (1 << i))
1908 key.tex.compare_funcs[i] = GL_LESS;
1909
1910 /* FINISHME: depth compares might use (0,0,0,W) for example */
1911 key.tex.swizzles[i] = SWIZZLE_XYZW;
1912 }
1913
1914 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
1915 key.drawable_height = ctx->DrawBuffer->Height;
1916 key.render_to_fbo = ctx->DrawBuffer->Name != 0;
1917 }
1918
1919 key.nr_color_regions = 1;
1920
1921 key.program_string_id = bfp->id;
1922
/* Compile with the guessed key, then restore the previously bound
 * program state.
 */
1923 uint32_t old_prog_offset = brw->wm.prog_offset;
1924 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
1925
1926 bool success = do_wm_prog(brw, prog, bfp, &key);
1927
1928 brw->wm.prog_offset = old_prog_offset;
1929 brw->wm.prog_data = old_prog_data;
1930
1931 return success;
1932 }
1933