brw_fs.cpp revision cf0e7aa9f8bc9c175ebd9b2ab3a8bfec4afc5abf
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs.cpp 25 * 26 * This file drives the GLSL IR -> LIR translation, contains the 27 * optimizations on the LIR, and drives the generation of native code 28 * from the LIR. 29 */ 30 31extern "C" { 32 33#include <sys/types.h> 34 35#include "main/macros.h" 36#include "main/shaderobj.h" 37#include "main/uniforms.h" 38#include "main/fbobject.h" 39#include "program/prog_parameter.h" 40#include "program/prog_print.h" 41#include "program/register_allocate.h" 42#include "program/sampler.h" 43#include "program/hash_table.h" 44#include "brw_context.h" 45#include "brw_eu.h" 46#include "brw_wm.h" 47} 48#include "brw_shader.h" 49#include "brw_fs.h" 50#include "glsl/glsl_types.h" 51#include "glsl/ir_print_visitor.h" 52 53int 54fs_visitor::type_size(const struct glsl_type *type) 55{ 56 unsigned int size, i; 57 58 switch (type->base_type) { 59 case GLSL_TYPE_UINT: 60 case GLSL_TYPE_INT: 61 case GLSL_TYPE_FLOAT: 62 case GLSL_TYPE_BOOL: 63 return type->components(); 64 case GLSL_TYPE_ARRAY: 65 return type_size(type->fields.array) * type->length; 66 case GLSL_TYPE_STRUCT: 67 size = 0; 68 for (i = 0; i < type->length; i++) { 69 size += type_size(type->fields.structure[i].type); 70 } 71 return size; 72 case GLSL_TYPE_SAMPLER: 73 /* Samplers take up no register space, since they're baked in at 74 * link time. 75 */ 76 return 0; 77 default: 78 assert(!"not reached"); 79 return 0; 80 } 81} 82 83void 84fs_visitor::fail(const char *format, ...) 85{ 86 va_list va; 87 char *msg; 88 89 if (failed) 90 return; 91 92 failed = true; 93 94 va_start(va, format); 95 msg = ralloc_vasprintf(mem_ctx, format, va); 96 va_end(va); 97 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg); 98 99 this->fail_msg = msg; 100 101 if (INTEL_DEBUG & DEBUG_WM) { 102 fprintf(stderr, "%s", msg); 103 } 104} 105 106void 107fs_visitor::push_force_uncompressed() 108{ 109 force_uncompressed_stack++; 110} 111 112void 113fs_visitor::pop_force_uncompressed() 114{ 115 force_uncompressed_stack--; 116 assert(force_uncompressed_stack >= 0); 117} 118 119void 120fs_visitor::push_force_sechalf() 121{ 122 force_sechalf_stack++; 123} 124 125void 126fs_visitor::pop_force_sechalf() 127{ 128 force_sechalf_stack--; 129 assert(force_sechalf_stack >= 0); 130} 131 132/** 133 * Returns how many MRFs an FS opcode will write over. 134 * 135 * Note that this is not the 0 or 1 implied writes in an actual gen 136 * instruction -- the FS opcodes often generate MOVs in addition. 137 */ 138int 139fs_visitor::implied_mrf_writes(fs_inst *inst) 140{ 141 if (inst->mlen == 0) 142 return 0; 143 144 switch (inst->opcode) { 145 case SHADER_OPCODE_RCP: 146 case SHADER_OPCODE_RSQ: 147 case SHADER_OPCODE_SQRT: 148 case SHADER_OPCODE_EXP2: 149 case SHADER_OPCODE_LOG2: 150 case SHADER_OPCODE_SIN: 151 case SHADER_OPCODE_COS: 152 return 1 * c->dispatch_width / 8; 153 case SHADER_OPCODE_POW: 154 case SHADER_OPCODE_INT_QUOTIENT: 155 case SHADER_OPCODE_INT_REMAINDER: 156 return 2 * c->dispatch_width / 8; 157 case SHADER_OPCODE_TEX: 158 case FS_OPCODE_TXB: 159 case SHADER_OPCODE_TXD: 160 case SHADER_OPCODE_TXF: 161 case SHADER_OPCODE_TXL: 162 case SHADER_OPCODE_TXS: 163 return 1; 164 case FS_OPCODE_FB_WRITE: 165 return 2; 166 case FS_OPCODE_PULL_CONSTANT_LOAD: 167 case FS_OPCODE_UNSPILL: 168 return 1; 169 case FS_OPCODE_SPILL: 170 return 2; 171 default: 172 assert(!"not reached"); 173 return inst->mlen; 174 } 175} 176 177int 178fs_visitor::virtual_grf_alloc(int size) 179{ 180 if (virtual_grf_array_size <= virtual_grf_next) { 181 if (virtual_grf_array_size == 0) 182 virtual_grf_array_size = 16; 183 else 184 virtual_grf_array_size *= 2; 185 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 186 virtual_grf_array_size); 187 } 188 virtual_grf_sizes[virtual_grf_next] = size; 189 return virtual_grf_next++; 190} 191 192/** Fixed HW reg constructor. */ 193fs_reg::fs_reg(enum register_file file, int reg) 194{ 195 init(); 196 this->file = file; 197 this->reg = reg; 198 this->type = BRW_REGISTER_TYPE_F; 199} 200 201/** Fixed HW reg constructor. */ 202fs_reg::fs_reg(enum register_file file, int reg, uint32_t type) 203{ 204 init(); 205 this->file = file; 206 this->reg = reg; 207 this->type = type; 208} 209 210/** Automatic reg constructor. */ 211fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 212{ 213 init(); 214 215 this->file = GRF; 216 this->reg = v->virtual_grf_alloc(v->type_size(type)); 217 this->reg_offset = 0; 218 this->type = brw_type_for_base_type(type); 219} 220 221fs_reg * 222fs_visitor::variable_storage(ir_variable *var) 223{ 224 return (fs_reg *)hash_table_find(this->variable_ht, var); 225} 226 227void 228import_uniforms_callback(const void *key, 229 void *data, 230 void *closure) 231{ 232 struct hash_table *dst_ht = (struct hash_table *)closure; 233 const fs_reg *reg = (const fs_reg *)data; 234 235 if (reg->file != UNIFORM) 236 return; 237 238 hash_table_insert(dst_ht, data, key); 239} 240 241/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 242 * This brings in those uniform definitions 243 */ 244void 245fs_visitor::import_uniforms(fs_visitor *v) 246{ 247 hash_table_call_foreach(v->variable_ht, 248 import_uniforms_callback, 249 variable_ht); 250 this->params_remap = v->params_remap; 251} 252 253/* Our support for uniforms is piggy-backed on the struct 254 * gl_fragment_program, because that's where the values actually 255 * get stored, rather than in some global gl_shader_program uniform 256 * store. 257 */ 258int 259fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 260{ 261 unsigned int offset = 0; 262 263 if (type->is_matrix()) { 264 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 265 type->vector_elements, 266 1); 267 268 for (unsigned int i = 0; i < type->matrix_columns; i++) { 269 offset += setup_uniform_values(loc + offset, column); 270 } 271 272 return offset; 273 } 274 275 switch (type->base_type) { 276 case GLSL_TYPE_FLOAT: 277 case GLSL_TYPE_UINT: 278 case GLSL_TYPE_INT: 279 case GLSL_TYPE_BOOL: 280 for (unsigned int i = 0; i < type->vector_elements; i++) { 281 unsigned int param = c->prog_data.nr_params++; 282 283 assert(param < ARRAY_SIZE(c->prog_data.param)); 284 285 if (ctx->Const.NativeIntegers) { 286 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 287 } else { 288 switch (type->base_type) { 289 case GLSL_TYPE_FLOAT: 290 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 291 break; 292 case GLSL_TYPE_UINT: 293 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 294 break; 295 case GLSL_TYPE_INT: 296 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 297 break; 298 case GLSL_TYPE_BOOL: 299 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 300 break; 301 default: 302 assert(!"not reached"); 303 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 304 break; 305 } 306 } 307 this->param_index[param] = loc; 308 this->param_offset[param] = i; 309 } 310 return 1; 311 312 case GLSL_TYPE_STRUCT: 313 for (unsigned int i = 0; i < type->length; i++) { 314 offset += setup_uniform_values(loc + offset, 315 type->fields.structure[i].type); 316 } 317 return offset; 318 319 case GLSL_TYPE_ARRAY: 320 for (unsigned int i = 0; i < type->length; i++) { 321 offset += setup_uniform_values(loc + offset, type->fields.array); 322 } 323 return offset; 324 325 case GLSL_TYPE_SAMPLER: 326 /* The sampler takes up a slot, but we don't use any values from it. */ 327 return 1; 328 329 default: 330 assert(!"not reached"); 331 return 0; 332 } 333} 334 335 336/* Our support for builtin uniforms is even scarier than non-builtin. 337 * It sits on top of the PROG_STATE_VAR parameters that are 338 * automatically updated from GL context state. 339 */ 340void 341fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 342{ 343 const ir_state_slot *const slots = ir->state_slots; 344 assert(ir->state_slots != NULL); 345 346 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 347 /* This state reference has already been setup by ir_to_mesa, but we'll 348 * get the same index back here. 349 */ 350 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 351 (gl_state_index *)slots[i].tokens); 352 353 /* Add each of the unique swizzles of the element as a parameter. 354 * This'll end up matching the expected layout of the 355 * array/matrix/structure we're trying to fill in. 356 */ 357 int last_swiz = -1; 358 for (unsigned int j = 0; j < 4; j++) { 359 int swiz = GET_SWZ(slots[i].swizzle, j); 360 if (swiz == last_swiz) 361 break; 362 last_swiz = swiz; 363 364 c->prog_data.param_convert[c->prog_data.nr_params] = 365 PARAM_NO_CONVERT; 366 this->param_index[c->prog_data.nr_params] = index; 367 this->param_offset[c->prog_data.nr_params] = swiz; 368 c->prog_data.nr_params++; 369 } 370 } 371} 372 373fs_reg * 374fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 375{ 376 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 377 fs_reg wpos = *reg; 378 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 379 380 /* gl_FragCoord.x */ 381 if (ir->pixel_center_integer) { 382 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 383 } else { 384 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 385 } 386 wpos.reg_offset++; 387 388 /* gl_FragCoord.y */ 389 if (!flip && ir->pixel_center_integer) { 390 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 391 } else { 392 fs_reg pixel_y = this->pixel_y; 393 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 394 395 if (flip) { 396 pixel_y.negate = true; 397 offset += c->key.drawable_height - 1.0; 398 } 399 400 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 401 } 402 wpos.reg_offset++; 403 404 /* gl_FragCoord.z */ 405 if (intel->gen >= 6) { 406 emit(BRW_OPCODE_MOV, wpos, 407 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 408 } else { 409 emit(FS_OPCODE_LINTERP, wpos, 410 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 411 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 412 interp_reg(FRAG_ATTRIB_WPOS, 2)); 413 } 414 wpos.reg_offset++; 415 416 /* gl_FragCoord.w: Already set up in emit_interpolation */ 417 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 418 419 return reg; 420} 421 422fs_inst * 423fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, 424 glsl_interp_qualifier interpolation_mode) 425{ 426 brw_wm_barycentric_interp_mode barycoord_mode; 427 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) 428 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; 429 else 430 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; 431 return emit(FS_OPCODE_LINTERP, attr, 432 this->delta_x[barycoord_mode], 433 this->delta_y[barycoord_mode], interp); 434} 435 436fs_reg * 437fs_visitor::emit_general_interpolation(ir_variable *ir) 438{ 439 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 440 reg->type = brw_type_for_base_type(ir->type->get_scalar_type()); 441 fs_reg attr = *reg; 442 443 unsigned int array_elements; 444 const glsl_type *type; 445 446 if (ir->type->is_array()) { 447 array_elements = ir->type->length; 448 if (array_elements == 0) { 449 fail("dereferenced array '%s' has length 0\n", ir->name); 450 } 451 type = ir->type->fields.array; 452 } else { 453 array_elements = 1; 454 type = ir->type; 455 } 456 457 glsl_interp_qualifier interpolation_mode = 458 ir->determine_interpolation_mode(c->key.flat_shade); 459 460 int location = ir->location; 461 for (unsigned int i = 0; i < array_elements; i++) { 462 for (unsigned int j = 0; j < type->matrix_columns; j++) { 463 if (urb_setup[location] == -1) { 464 /* If there's no incoming setup data for this slot, don't 465 * emit interpolation for it. 466 */ 467 attr.reg_offset += type->vector_elements; 468 location++; 469 continue; 470 } 471 472 if (interpolation_mode == INTERP_QUALIFIER_FLAT) { 473 /* Constant interpolation (flat shading) case. The SF has 474 * handed us defined values in only the constant offset 475 * field of the setup reg. 476 */ 477 for (unsigned int k = 0; k < type->vector_elements; k++) { 478 struct brw_reg interp = interp_reg(location, k); 479 interp = suboffset(interp, 3); 480 interp.type = reg->type; 481 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 482 attr.reg_offset++; 483 } 484 } else { 485 /* Smooth/noperspective interpolation case. */ 486 for (unsigned int k = 0; k < type->vector_elements; k++) { 487 /* FINISHME: At some point we probably want to push 488 * this farther by giving similar treatment to the 489 * other potentially constant components of the 490 * attribute, as well as making brw_vs_constval.c 491 * handle varyings other than gl_TexCoord. 492 */ 493 if (location >= FRAG_ATTRIB_TEX0 && 494 location <= FRAG_ATTRIB_TEX7 && 495 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) { 496 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f)); 497 } else { 498 struct brw_reg interp = interp_reg(location, k); 499 emit_linterp(attr, fs_reg(interp), interpolation_mode); 500 if (intel->gen < 6) { 501 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 502 } 503 } 504 attr.reg_offset++; 505 } 506 507 } 508 location++; 509 } 510 } 511 512 return reg; 513} 514 515fs_reg * 516fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 517{ 518 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 519 520 /* The frontfacing comes in as a bit in the thread payload. */ 521 if (intel->gen >= 6) { 522 emit(BRW_OPCODE_ASR, *reg, 523 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 524 fs_reg(15)); 525 emit(BRW_OPCODE_NOT, *reg, *reg); 526 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 527 } else { 528 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 529 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 530 * us front face 531 */ 532 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 533 fs_reg(r1_6ud), 534 fs_reg(1u << 31)); 535 inst->conditional_mod = BRW_CONDITIONAL_L; 536 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 537 } 538 539 return reg; 540} 541 542fs_inst * 543fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src) 544{ 545 switch (opcode) { 546 case SHADER_OPCODE_RCP: 547 case SHADER_OPCODE_RSQ: 548 case SHADER_OPCODE_SQRT: 549 case SHADER_OPCODE_EXP2: 550 case SHADER_OPCODE_LOG2: 551 case SHADER_OPCODE_SIN: 552 case SHADER_OPCODE_COS: 553 break; 554 default: 555 assert(!"not reached: bad math opcode"); 556 return NULL; 557 } 558 559 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 560 * might be able to do better by doing execsize = 1 math and then 561 * expanding that result out, but we would need to be careful with 562 * masking. 563 * 564 * Gen 6 hardware ignores source modifiers (negate and abs) on math 565 * instructions, so we also move to a temp to set those up. 566 */ 567 if (intel->gen == 6 && (src.file == UNIFORM || 568 src.abs || 569 src.negate)) { 570 fs_reg expanded = fs_reg(this, glsl_type::float_type); 571 emit(BRW_OPCODE_MOV, expanded, src); 572 src = expanded; 573 } 574 575 fs_inst *inst = emit(opcode, dst, src); 576 577 if (intel->gen < 6) { 578 inst->base_mrf = 2; 579 inst->mlen = c->dispatch_width / 8; 580 } 581 582 return inst; 583} 584 585fs_inst * 586fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1) 587{ 588 int base_mrf = 2; 589 fs_inst *inst; 590 591 switch (opcode) { 592 case SHADER_OPCODE_POW: 593 case SHADER_OPCODE_INT_QUOTIENT: 594 case SHADER_OPCODE_INT_REMAINDER: 595 break; 596 default: 597 assert(!"not reached: unsupported binary math opcode."); 598 return NULL; 599 } 600 601 if (intel->gen >= 7) { 602 inst = emit(opcode, dst, src0, src1); 603 } else if (intel->gen == 6) { 604 /* Can't do hstride == 0 args to gen6 math, so expand it out. 605 * 606 * The hardware ignores source modifiers (negate and abs) on math 607 * instructions, so we also move to a temp to set those up. 608 */ 609 if (src0.file == UNIFORM || src0.abs || src0.negate) { 610 fs_reg expanded = fs_reg(this, glsl_type::float_type); 611 expanded.type = src0.type; 612 emit(BRW_OPCODE_MOV, expanded, src0); 613 src0 = expanded; 614 } 615 616 if (src1.file == UNIFORM || src1.abs || src1.negate) { 617 fs_reg expanded = fs_reg(this, glsl_type::float_type); 618 expanded.type = src1.type; 619 emit(BRW_OPCODE_MOV, expanded, src1); 620 src1 = expanded; 621 } 622 623 inst = emit(opcode, dst, src0, src1); 624 } else { 625 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 626 * "Message Payload": 627 * 628 * "Operand0[7]. For the INT DIV functions, this operand is the 629 * denominator." 630 * ... 631 * "Operand1[7]. For the INT DIV functions, this operand is the 632 * numerator." 633 */ 634 bool is_int_div = opcode != SHADER_OPCODE_POW; 635 fs_reg &op0 = is_int_div ? src1 : src0; 636 fs_reg &op1 = is_int_div ? src0 : src1; 637 638 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1); 639 inst = emit(opcode, dst, op0, reg_null_f); 640 641 inst->base_mrf = base_mrf; 642 inst->mlen = 2 * c->dispatch_width / 8; 643 } 644 return inst; 645} 646 647/** 648 * To be called after the last _mesa_add_state_reference() call, to 649 * set up prog_data.param[] for assign_curb_setup() and 650 * setup_pull_constants(). 651 */ 652void 653fs_visitor::setup_paramvalues_refs() 654{ 655 if (c->dispatch_width != 8) 656 return; 657 658 /* Set up the pointers to ParamValues now that that array is finalized. */ 659 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 660 c->prog_data.param[i] = 661 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] + 662 this->param_offset[i]; 663 } 664} 665 666void 667fs_visitor::assign_curb_setup() 668{ 669 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 670 if (c->dispatch_width == 8) { 671 c->prog_data.first_curbe_grf = c->nr_payload_regs; 672 } else { 673 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 674 } 675 676 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 677 foreach_list(node, &this->instructions) { 678 fs_inst *inst = (fs_inst *)node; 679 680 for (unsigned int i = 0; i < 3; i++) { 681 if (inst->src[i].file == UNIFORM) { 682 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; 683 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 684 constant_nr / 8, 685 constant_nr % 8); 686 687 inst->src[i].file = FIXED_HW_REG; 688 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 689 } 690 } 691 } 692} 693 694void 695fs_visitor::calculate_urb_setup() 696{ 697 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 698 urb_setup[i] = -1; 699 } 700 701 int urb_next = 0; 702 /* Figure out where each of the incoming setup attributes lands. */ 703 if (intel->gen >= 6) { 704 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 705 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) { 706 urb_setup[i] = urb_next++; 707 } 708 } 709 } else { 710 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 711 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 712 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 713 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); 714 715 if (fp_index >= 0) 716 urb_setup[fp_index] = urb_next++; 717 } 718 } 719 720 /* 721 * It's a FS only attribute, and we did interpolation for this attribute 722 * in SF thread. So, count it here, too. 723 * 724 * See compile_sf_prog() for more info. 725 */ 726 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC)) 727 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++; 728 } 729 730 /* Each attribute is 4 setup channels, each of which is half a reg. */ 731 c->prog_data.urb_read_length = urb_next * 2; 732} 733 734void 735fs_visitor::assign_urb_setup() 736{ 737 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 738 739 /* Offset all the urb_setup[] index by the actual position of the 740 * setup regs, now that the location of the constants has been chosen. 741 */ 742 foreach_list(node, &this->instructions) { 743 fs_inst *inst = (fs_inst *)node; 744 745 if (inst->opcode == FS_OPCODE_LINTERP) { 746 assert(inst->src[2].file == FIXED_HW_REG); 747 inst->src[2].fixed_hw_reg.nr += urb_start; 748 } 749 750 if (inst->opcode == FS_OPCODE_CINTERP) { 751 assert(inst->src[0].file == FIXED_HW_REG); 752 inst->src[0].fixed_hw_reg.nr += urb_start; 753 } 754 } 755 756 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 757} 758 759/** 760 * Split large virtual GRFs into separate components if we can. 761 * 762 * This is mostly duplicated with what brw_fs_vector_splitting does, 763 * but that's really conservative because it's afraid of doing 764 * splitting that doesn't result in real progress after the rest of 765 * the optimization phases, which would cause infinite looping in 766 * optimization. We can do it once here, safely. This also has the 767 * opportunity to split interpolated values, or maybe even uniforms, 768 * which we don't have at the IR level. 769 * 770 * We want to split, because virtual GRFs are what we register 771 * allocate and spill (due to contiguousness requirements for some 772 * instructions), and they're what we naturally generate in the 773 * codegen process, but most virtual GRFs don't actually need to be 774 * contiguous sets of GRFs. If we split, we'll end up with reduced 775 * live intervals and better dead code elimination and coalescing. 776 */ 777void 778fs_visitor::split_virtual_grfs() 779{ 780 int num_vars = this->virtual_grf_next; 781 bool split_grf[num_vars]; 782 int new_virtual_grf[num_vars]; 783 784 /* Try to split anything > 0 sized. */ 785 for (int i = 0; i < num_vars; i++) { 786 if (this->virtual_grf_sizes[i] != 1) 787 split_grf[i] = true; 788 else 789 split_grf[i] = false; 790 } 791 792 if (brw->has_pln && 793 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) { 794 /* PLN opcodes rely on the delta_xy being contiguous. We only have to 795 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to 796 * Gen6, that was the only supported interpolation mode, and since Gen6, 797 * delta_x and delta_y are in fixed hardware registers. 798 */ 799 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] = 800 false; 801 } 802 803 foreach_list(node, &this->instructions) { 804 fs_inst *inst = (fs_inst *)node; 805 806 /* Texturing produces 4 contiguous registers, so no splitting. */ 807 if (inst->is_tex()) { 808 split_grf[inst->dst.reg] = false; 809 } 810 } 811 812 /* Allocate new space for split regs. Note that the virtual 813 * numbers will be contiguous. 814 */ 815 for (int i = 0; i < num_vars; i++) { 816 if (split_grf[i]) { 817 new_virtual_grf[i] = virtual_grf_alloc(1); 818 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 819 int reg = virtual_grf_alloc(1); 820 assert(reg == new_virtual_grf[i] + j - 1); 821 (void) reg; 822 } 823 this->virtual_grf_sizes[i] = 1; 824 } 825 } 826 827 foreach_list(node, &this->instructions) { 828 fs_inst *inst = (fs_inst *)node; 829 830 if (inst->dst.file == GRF && 831 split_grf[inst->dst.reg] && 832 inst->dst.reg_offset != 0) { 833 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 834 inst->dst.reg_offset - 1); 835 inst->dst.reg_offset = 0; 836 } 837 for (int i = 0; i < 3; i++) { 838 if (inst->src[i].file == GRF && 839 split_grf[inst->src[i].reg] && 840 inst->src[i].reg_offset != 0) { 841 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 842 inst->src[i].reg_offset - 1); 843 inst->src[i].reg_offset = 0; 844 } 845 } 846 } 847 this->live_intervals_valid = false; 848} 849 850bool 851fs_visitor::remove_dead_constants() 852{ 853 if (c->dispatch_width == 8) { 854 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params); 855 856 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) 857 this->params_remap[i] = -1; 858 859 /* Find which params are still in use. */ 860 foreach_list(node, &this->instructions) { 861 fs_inst *inst = (fs_inst *)node; 862 863 for (int i = 0; i < 3; i++) { 864 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; 865 866 if (inst->src[i].file != UNIFORM) 867 continue; 868 869 assert(constant_nr < (int)c->prog_data.nr_params); 870 871 /* For now, set this to non-negative. We'll give it the 872 * actual new number in a moment, in order to keep the 873 * register numbers nicely ordered. 874 */ 875 this->params_remap[constant_nr] = 0; 876 } 877 } 878 879 /* Figure out what the new numbers for the params will be. At some 880 * point when we're doing uniform array access, we're going to want 881 * to keep the distinction between .reg and .reg_offset, but for 882 * now we don't care. 883 */ 884 unsigned int new_nr_params = 0; 885 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 886 if (this->params_remap[i] != -1) { 887 this->params_remap[i] = new_nr_params++; 888 } 889 } 890 891 /* Update the list of params to be uploaded to match our new numbering. */ 892 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 893 int remapped = this->params_remap[i]; 894 895 if (remapped == -1) 896 continue; 897 898 /* We've already done setup_paramvalues_refs() so no need to worry 899 * about param_index and param_offset. 900 */ 901 c->prog_data.param[remapped] = c->prog_data.param[i]; 902 c->prog_data.param_convert[remapped] = c->prog_data.param_convert[i]; 903 } 904 905 c->prog_data.nr_params = new_nr_params; 906 } else { 907 /* This should have been generated in the 8-wide pass already. */ 908 assert(this->params_remap); 909 } 910 911 /* Now do the renumbering of the shader to remove unused params. */ 912 foreach_list(node, &this->instructions) { 913 fs_inst *inst = (fs_inst *)node; 914 915 for (int i = 0; i < 3; i++) { 916 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; 917 918 if (inst->src[i].file != UNIFORM) 919 continue; 920 921 assert(this->params_remap[constant_nr] != -1); 922 inst->src[i].reg = this->params_remap[constant_nr]; 923 inst->src[i].reg_offset = 0; 924 } 925 } 926 927 return true; 928} 929 930/** 931 * Choose accesses from the UNIFORM file to demote to using the pull 932 * constant buffer. 933 * 934 * We allow a fragment shader to have more than the specified minimum 935 * maximum number of fragment shader uniform components (64). If 936 * there are too many of these, they'd fill up all of register space. 937 * So, this will push some of them out to the pull constant buffer and 938 * update the program to load them. 939 */ 940void 941fs_visitor::setup_pull_constants() 942{ 943 /* Only allow 16 registers (128 uniform components) as push constants. */ 944 unsigned int max_uniform_components = 16 * 8; 945 if (c->prog_data.nr_params <= max_uniform_components) 946 return; 947 948 if (c->dispatch_width == 16) { 949 fail("Pull constants not supported in 16-wide\n"); 950 return; 951 } 952 953 /* Just demote the end of the list. We could probably do better 954 * here, demoting things that are rarely used in the program first. 955 */ 956 int pull_uniform_base = max_uniform_components; 957 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 958 959 foreach_list(node, &this->instructions) { 960 fs_inst *inst = (fs_inst *)node; 961 962 for (int i = 0; i < 3; i++) { 963 if (inst->src[i].file != UNIFORM) 964 continue; 965 966 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset; 967 if (uniform_nr < pull_uniform_base) 968 continue; 969 970 fs_reg dst = fs_reg(this, glsl_type::float_type); 971 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 972 dst); 973 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 974 pull->ir = inst->ir; 975 pull->annotation = inst->annotation; 976 pull->base_mrf = 14; 977 pull->mlen = 1; 978 979 inst->insert_before(pull); 980 981 inst->src[i].file = GRF; 982 inst->src[i].reg = dst.reg; 983 inst->src[i].reg_offset = 0; 984 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 985 } 986 } 987 988 for (int i = 0; i < pull_uniform_count; i++) { 989 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 990 c->prog_data.pull_param_convert[i] = 991 c->prog_data.param_convert[pull_uniform_base + i]; 992 } 993 c->prog_data.nr_params -= pull_uniform_count; 994 c->prog_data.nr_pull_params = pull_uniform_count; 995} 996 997/** 998 * Attempts to move immediate constants into the immediate 999 * constant slot of following instructions. 1000 * 1001 * Immediate constants are a bit tricky -- they have to be in the last 1002 * operand slot, you can't do abs/negate on them, 1003 */ 1004 1005bool 1006fs_visitor::propagate_constants() 1007{ 1008 bool progress = false; 1009 1010 calculate_live_intervals(); 1011 1012 foreach_list(node, &this->instructions) { 1013 fs_inst *inst = (fs_inst *)node; 1014 1015 if (inst->opcode != BRW_OPCODE_MOV || 1016 inst->predicated || 1017 inst->dst.file != GRF || inst->src[0].file != IMM || 1018 inst->dst.type != inst->src[0].type || 1019 (c->dispatch_width == 16 && 1020 (inst->force_uncompressed || inst->force_sechalf))) 1021 continue; 1022 1023 /* Don't bother with cases where we should have had the 1024 * operation on the constant folded in GLSL already. 1025 */ 1026 if (inst->saturate) 1027 continue; 1028 1029 /* Found a move of a constant to a GRF. Find anything else using the GRF 1030 * before it's written, and replace it with the constant if we can. 1031 */ 1032 for (fs_inst *scan_inst = (fs_inst *)inst->next; 1033 !scan_inst->is_tail_sentinel(); 1034 scan_inst = (fs_inst *)scan_inst->next) { 1035 if (scan_inst->opcode == BRW_OPCODE_DO || 1036 scan_inst->opcode == BRW_OPCODE_WHILE || 1037 scan_inst->opcode == BRW_OPCODE_ELSE || 1038 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1039 break; 1040 } 1041 1042 for (int i = 2; i >= 0; i--) { 1043 if (scan_inst->src[i].file != GRF || 1044 scan_inst->src[i].reg != inst->dst.reg || 1045 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 1046 continue; 1047 1048 /* Don't bother with cases where we should have had the 1049 * operation on the constant folded in GLSL already. 1050 */ 1051 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 1052 continue; 1053 1054 switch (scan_inst->opcode) { 1055 case BRW_OPCODE_MOV: 1056 scan_inst->src[i] = inst->src[0]; 1057 progress = true; 1058 break; 1059 1060 case BRW_OPCODE_MUL: 1061 case BRW_OPCODE_ADD: 1062 if (i == 1) { 1063 scan_inst->src[i] = inst->src[0]; 1064 progress = true; 1065 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1066 /* Fit this constant in by commuting the operands. 1067 * Exception: we can't do this for 32-bit integer MUL 1068 * because it's asymmetric. 1069 */ 1070 if (scan_inst->opcode == BRW_OPCODE_MUL && 1071 (scan_inst->src[1].type == BRW_REGISTER_TYPE_D || 1072 scan_inst->src[1].type == BRW_REGISTER_TYPE_UD)) 1073 break; 1074 scan_inst->src[0] = scan_inst->src[1]; 1075 scan_inst->src[1] = inst->src[0]; 1076 progress = true; 1077 } 1078 break; 1079 1080 case BRW_OPCODE_CMP: 1081 case BRW_OPCODE_IF: 1082 if (i == 1) { 1083 scan_inst->src[i] = inst->src[0]; 1084 progress = true; 1085 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1086 uint32_t new_cmod; 1087 1088 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 1089 if (new_cmod != ~0u) { 1090 /* Fit this constant in by swapping the operands and 1091 * flipping the test 1092 */ 1093 scan_inst->src[0] = scan_inst->src[1]; 1094 scan_inst->src[1] = inst->src[0]; 1095 scan_inst->conditional_mod = new_cmod; 1096 progress = true; 1097 } 1098 } 1099 break; 1100 1101 case BRW_OPCODE_SEL: 1102 if (i == 1) { 1103 scan_inst->src[i] = inst->src[0]; 1104 progress = true; 1105 } else if (i == 0 && scan_inst->src[1].file != IMM) { 1106 scan_inst->src[0] = scan_inst->src[1]; 1107 scan_inst->src[1] = inst->src[0]; 1108 1109 /* If this was predicated, flipping operands means 1110 * we also need to flip the predicate. 1111 */ 1112 if (scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) { 1113 scan_inst->predicate_inverse = 1114 !scan_inst->predicate_inverse; 1115 } 1116 progress = true; 1117 } 1118 break; 1119 1120 case SHADER_OPCODE_RCP: 1121 /* The hardware doesn't do math on immediate values 1122 * (because why are you doing that, seriously?), but 1123 * the correct answer is to just constant fold it 1124 * anyway. 1125 */ 1126 assert(i == 0); 1127 if (inst->src[0].imm.f != 0.0f) { 1128 scan_inst->opcode = BRW_OPCODE_MOV; 1129 scan_inst->src[0] = inst->src[0]; 1130 scan_inst->src[0].imm.f = 1.0f / scan_inst->src[0].imm.f; 1131 progress = true; 1132 } 1133 break; 1134 1135 default: 1136 break; 1137 } 1138 } 1139 1140 if (scan_inst->dst.file == GRF && 1141 scan_inst->dst.reg == inst->dst.reg && 1142 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1143 scan_inst->is_tex())) { 1144 break; 1145 } 1146 } 1147 } 1148 1149 if (progress) 1150 this->live_intervals_valid = false; 1151 1152 return progress; 1153} 1154 1155 1156/** 1157 * Attempts to move immediate constants into the immediate 1158 * constant slot of following instructions. 1159 * 1160 * Immediate constants are a bit tricky -- they have to be in the last 1161 * operand slot, you can't do abs/negate on them, 1162 */ 1163 1164bool 1165fs_visitor::opt_algebraic() 1166{ 1167 bool progress = false; 1168 1169 calculate_live_intervals(); 1170 1171 foreach_list(node, &this->instructions) { 1172 fs_inst *inst = (fs_inst *)node; 1173 1174 switch (inst->opcode) { 1175 case BRW_OPCODE_MUL: 1176 if (inst->src[1].file != IMM) 1177 continue; 1178 1179 /* a * 1.0 = a */ 1180 if (inst->src[1].type == BRW_REGISTER_TYPE_F && 1181 inst->src[1].imm.f == 1.0) { 1182 inst->opcode = BRW_OPCODE_MOV; 1183 inst->src[1] = reg_undef; 1184 progress = true; 1185 break; 1186 } 1187 1188 break; 1189 default: 1190 break; 1191 } 1192 } 1193 1194 return progress; 1195} 1196 1197/** 1198 * Must be called after calculate_live_intervales() to remove unused 1199 * writes to registers -- register allocation will fail otherwise 1200 * because something deffed but not used won't be considered to 1201 * interfere with other regs. 1202 */ 1203bool 1204fs_visitor::dead_code_eliminate() 1205{ 1206 bool progress = false; 1207 int pc = 0; 1208 1209 calculate_live_intervals(); 1210 1211 foreach_list_safe(node, &this->instructions) { 1212 fs_inst *inst = (fs_inst *)node; 1213 1214 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 1215 inst->remove(); 1216 progress = true; 1217 } 1218 1219 pc++; 1220 } 1221 1222 if (progress) 1223 live_intervals_valid = false; 1224 1225 return progress; 1226} 1227 1228/** 1229 * Implements a second type of register coalescing: This one checks if 1230 * the two regs involved in a raw move don't interfere, in which case 1231 * they can both by stored in the same place and the MOV removed. 1232 */ 1233bool 1234fs_visitor::register_coalesce_2() 1235{ 1236 bool progress = false; 1237 1238 calculate_live_intervals(); 1239 1240 foreach_list_safe(node, &this->instructions) { 1241 fs_inst *inst = (fs_inst *)node; 1242 1243 if (inst->opcode != BRW_OPCODE_MOV || 1244 inst->predicated || 1245 inst->saturate || 1246 inst->src[0].file != GRF || 1247 inst->src[0].negate || 1248 inst->src[0].abs || 1249 inst->src[0].smear != -1 || 1250 inst->dst.file != GRF || 1251 inst->dst.type != inst->src[0].type || 1252 virtual_grf_sizes[inst->src[0].reg] != 1 || 1253 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) { 1254 continue; 1255 } 1256 1257 int reg_from = inst->src[0].reg; 1258 assert(inst->src[0].reg_offset == 0); 1259 int reg_to = inst->dst.reg; 1260 int reg_to_offset = inst->dst.reg_offset; 1261 1262 foreach_list_safe(node, &this->instructions) { 1263 fs_inst *scan_inst = (fs_inst *)node; 1264 1265 if (scan_inst->dst.file == GRF && 1266 scan_inst->dst.reg == reg_from) { 1267 scan_inst->dst.reg = reg_to; 1268 scan_inst->dst.reg_offset = reg_to_offset; 1269 } 1270 for (int i = 0; i < 3; i++) { 1271 if (scan_inst->src[i].file == GRF && 1272 scan_inst->src[i].reg == reg_from) { 1273 scan_inst->src[i].reg = reg_to; 1274 scan_inst->src[i].reg_offset = reg_to_offset; 1275 } 1276 } 1277 } 1278 1279 inst->remove(); 1280 live_intervals_valid = false; 1281 progress = true; 1282 continue; 1283 } 1284 1285 return progress; 1286} 1287 1288bool 1289fs_visitor::register_coalesce() 1290{ 1291 bool progress = false; 1292 int if_depth = 0; 1293 int loop_depth = 0; 1294 1295 foreach_list_safe(node, &this->instructions) { 1296 fs_inst *inst = (fs_inst *)node; 1297 1298 /* Make sure that we dominate the instructions we're going to 1299 * scan for interfering with our coalescing, or we won't have 1300 * scanned enough to see if anything interferes with our 1301 * coalescing. We don't dominate the following instructions if 1302 * we're in a loop or an if block. 1303 */ 1304 switch (inst->opcode) { 1305 case BRW_OPCODE_DO: 1306 loop_depth++; 1307 break; 1308 case BRW_OPCODE_WHILE: 1309 loop_depth--; 1310 break; 1311 case BRW_OPCODE_IF: 1312 if_depth++; 1313 break; 1314 case BRW_OPCODE_ENDIF: 1315 if_depth--; 1316 break; 1317 default: 1318 break; 1319 } 1320 if (loop_depth || if_depth) 1321 continue; 1322 1323 if (inst->opcode != BRW_OPCODE_MOV || 1324 inst->predicated || 1325 inst->saturate || 1326 inst->dst.file != GRF || (inst->src[0].file != GRF && 1327 inst->src[0].file != UNIFORM)|| 1328 inst->dst.type != inst->src[0].type) 1329 continue; 1330 1331 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 1332 1333 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 1334 * them: check for no writes to either one until the exit of the 1335 * program. 1336 */ 1337 bool interfered = false; 1338 1339 for (fs_inst *scan_inst = (fs_inst *)inst->next; 1340 !scan_inst->is_tail_sentinel(); 1341 scan_inst = (fs_inst *)scan_inst->next) { 1342 if (scan_inst->dst.file == GRF) { 1343 if (scan_inst->dst.reg == inst->dst.reg && 1344 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 1345 scan_inst->is_tex())) { 1346 interfered = true; 1347 break; 1348 } 1349 if (inst->src[0].file == GRF && 1350 scan_inst->dst.reg == inst->src[0].reg && 1351 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 1352 scan_inst->is_tex())) { 1353 interfered = true; 1354 break; 1355 } 1356 } 1357 1358 /* The gen6 MATH instruction can't handle source modifiers or 1359 * unusual register regions, so avoid coalescing those for 1360 * now. We should do something more specific. 1361 */ 1362 if (intel->gen >= 6 && 1363 scan_inst->is_math() && 1364 (has_source_modifiers || inst->src[0].file == UNIFORM)) { 1365 interfered = true; 1366 break; 1367 } 1368 1369 /* The accumulator result appears to get used for the 1370 * conditional modifier generation. When negating a UD 1371 * value, there is a 33rd bit generated for the sign in the 1372 * accumulator value, so now you can't check, for example, 1373 * equality with a 32-bit value. See piglit fs-op-neg-uint. 1374 */ 1375 if (scan_inst->conditional_mod && 1376 inst->src[0].negate && 1377 inst->src[0].type == BRW_REGISTER_TYPE_UD) { 1378 interfered = true; 1379 break; 1380 } 1381 } 1382 if (interfered) { 1383 continue; 1384 } 1385 1386 /* Rewrite the later usage to point at the source of the move to 1387 * be removed. 1388 */ 1389 for (fs_inst *scan_inst = inst; 1390 !scan_inst->is_tail_sentinel(); 1391 scan_inst = (fs_inst *)scan_inst->next) { 1392 for (int i = 0; i < 3; i++) { 1393 if (scan_inst->src[i].file == GRF && 1394 scan_inst->src[i].reg == inst->dst.reg && 1395 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 1396 fs_reg new_src = inst->src[0]; 1397 if (scan_inst->src[i].abs) { 1398 new_src.negate = 0; 1399 new_src.abs = 1; 1400 } 1401 new_src.negate ^= scan_inst->src[i].negate; 1402 scan_inst->src[i] = new_src; 1403 } 1404 } 1405 } 1406 1407 inst->remove(); 1408 progress = true; 1409 } 1410 1411 if (progress) 1412 live_intervals_valid = false; 1413 1414 return progress; 1415} 1416 1417 1418bool 1419fs_visitor::compute_to_mrf() 1420{ 1421 bool progress = false; 1422 int next_ip = 0; 1423 1424 calculate_live_intervals(); 1425 1426 foreach_list_safe(node, &this->instructions) { 1427 fs_inst *inst = (fs_inst *)node; 1428 1429 int ip = next_ip; 1430 next_ip++; 1431 1432 if (inst->opcode != BRW_OPCODE_MOV || 1433 inst->predicated || 1434 inst->dst.file != MRF || inst->src[0].file != GRF || 1435 inst->dst.type != inst->src[0].type || 1436 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 1437 continue; 1438 1439 /* Work out which hardware MRF registers are written by this 1440 * instruction. 1441 */ 1442 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4; 1443 int mrf_high; 1444 if (inst->dst.reg & BRW_MRF_COMPR4) { 1445 mrf_high = mrf_low + 4; 1446 } else if (c->dispatch_width == 16 && 1447 (!inst->force_uncompressed && !inst->force_sechalf)) { 1448 mrf_high = mrf_low + 1; 1449 } else { 1450 mrf_high = mrf_low; 1451 } 1452 1453 /* Can't compute-to-MRF this GRF if someone else was going to 1454 * read it later. 1455 */ 1456 if (this->virtual_grf_use[inst->src[0].reg] > ip) 1457 continue; 1458 1459 /* Found a move of a GRF to a MRF. Let's see if we can go 1460 * rewrite the thing that made this GRF to write into the MRF. 1461 */ 1462 fs_inst *scan_inst; 1463 for (scan_inst = (fs_inst *)inst->prev; 1464 scan_inst->prev != NULL; 1465 scan_inst = (fs_inst *)scan_inst->prev) { 1466 if (scan_inst->dst.file == GRF && 1467 scan_inst->dst.reg == inst->src[0].reg) { 1468 /* Found the last thing to write our reg we want to turn 1469 * into a compute-to-MRF. 1470 */ 1471 1472 if (scan_inst->is_tex()) { 1473 /* texturing writes several continuous regs, so we can't 1474 * compute-to-mrf that. 1475 */ 1476 break; 1477 } 1478 1479 /* If it's predicated, it (probably) didn't populate all 1480 * the channels. We might be able to rewrite everything 1481 * that writes that reg, but it would require smarter 1482 * tracking to delay the rewriting until complete success. 1483 */ 1484 if (scan_inst->predicated) 1485 break; 1486 1487 /* If it's half of register setup and not the same half as 1488 * our MOV we're trying to remove, bail for now. 1489 */ 1490 if (scan_inst->force_uncompressed != inst->force_uncompressed || 1491 scan_inst->force_sechalf != inst->force_sechalf) { 1492 break; 1493 } 1494 1495 /* SEND instructions can't have MRF as a destination. */ 1496 if (scan_inst->mlen) 1497 break; 1498 1499 if (intel->gen >= 6) { 1500 /* gen6 math instructions must have the destination be 1501 * GRF, so no compute-to-MRF for them. 1502 */ 1503 if (scan_inst->is_math()) { 1504 break; 1505 } 1506 } 1507 1508 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 1509 /* Found the creator of our MRF's source value. */ 1510 scan_inst->dst.file = MRF; 1511 scan_inst->dst.reg = inst->dst.reg; 1512 scan_inst->saturate |= inst->saturate; 1513 inst->remove(); 1514 progress = true; 1515 } 1516 break; 1517 } 1518 1519 /* We don't handle flow control here. Most computation of 1520 * values that end up in MRFs are shortly before the MRF 1521 * write anyway. 1522 */ 1523 if (scan_inst->opcode == BRW_OPCODE_DO || 1524 scan_inst->opcode == BRW_OPCODE_WHILE || 1525 scan_inst->opcode == BRW_OPCODE_ELSE || 1526 scan_inst->opcode == BRW_OPCODE_ENDIF) { 1527 break; 1528 } 1529 1530 /* You can't read from an MRF, so if someone else reads our 1531 * MRF's source GRF that we wanted to rewrite, that stops us. 1532 */ 1533 bool interfered = false; 1534 for (int i = 0; i < 3; i++) { 1535 if (scan_inst->src[i].file == GRF && 1536 scan_inst->src[i].reg == inst->src[0].reg && 1537 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 1538 interfered = true; 1539 } 1540 } 1541 if (interfered) 1542 break; 1543 1544 if (scan_inst->dst.file == MRF) { 1545 /* If somebody else writes our MRF here, we can't 1546 * compute-to-MRF before that. 1547 */ 1548 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4; 1549 int scan_mrf_high; 1550 1551 if (scan_inst->dst.reg & BRW_MRF_COMPR4) { 1552 scan_mrf_high = scan_mrf_low + 4; 1553 } else if (c->dispatch_width == 16 && 1554 (!scan_inst->force_uncompressed && 1555 !scan_inst->force_sechalf)) { 1556 scan_mrf_high = scan_mrf_low + 1; 1557 } else { 1558 scan_mrf_high = scan_mrf_low; 1559 } 1560 1561 if (mrf_low == scan_mrf_low || 1562 mrf_low == scan_mrf_high || 1563 mrf_high == scan_mrf_low || 1564 mrf_high == scan_mrf_high) { 1565 break; 1566 } 1567 } 1568 1569 if (scan_inst->mlen > 0) { 1570 /* Found a SEND instruction, which means that there are 1571 * live values in MRFs from base_mrf to base_mrf + 1572 * scan_inst->mlen - 1. Don't go pushing our MRF write up 1573 * above it. 1574 */ 1575 if (mrf_low >= scan_inst->base_mrf && 1576 mrf_low < scan_inst->base_mrf + scan_inst->mlen) { 1577 break; 1578 } 1579 if (mrf_high >= scan_inst->base_mrf && 1580 mrf_high < scan_inst->base_mrf + scan_inst->mlen) { 1581 break; 1582 } 1583 } 1584 } 1585 } 1586 1587 return progress; 1588} 1589 1590/** 1591 * Walks through basic blocks, looking for repeated MRF writes and 1592 * removing the later ones. 1593 */ 1594bool 1595fs_visitor::remove_duplicate_mrf_writes() 1596{ 1597 fs_inst *last_mrf_move[16]; 1598 bool progress = false; 1599 1600 /* Need to update the MRF tracking for compressed instructions. */ 1601 if (c->dispatch_width == 16) 1602 return false; 1603 1604 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1605 1606 foreach_list_safe(node, &this->instructions) { 1607 fs_inst *inst = (fs_inst *)node; 1608 1609 switch (inst->opcode) { 1610 case BRW_OPCODE_DO: 1611 case BRW_OPCODE_WHILE: 1612 case BRW_OPCODE_IF: 1613 case BRW_OPCODE_ELSE: 1614 case BRW_OPCODE_ENDIF: 1615 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 1616 continue; 1617 default: 1618 break; 1619 } 1620 1621 if (inst->opcode == BRW_OPCODE_MOV && 1622 inst->dst.file == MRF) { 1623 fs_inst *prev_inst = last_mrf_move[inst->dst.reg]; 1624 if (prev_inst && inst->equals(prev_inst)) { 1625 inst->remove(); 1626 progress = true; 1627 continue; 1628 } 1629 } 1630 1631 /* Clear out the last-write records for MRFs that were overwritten. */ 1632 if (inst->dst.file == MRF) { 1633 last_mrf_move[inst->dst.reg] = NULL; 1634 } 1635 1636 if (inst->mlen > 0) { 1637 /* Found a SEND instruction, which will include two or fewer 1638 * implied MRF writes. We could do better here. 1639 */ 1640 for (int i = 0; i < implied_mrf_writes(inst); i++) { 1641 last_mrf_move[inst->base_mrf + i] = NULL; 1642 } 1643 } 1644 1645 /* Clear out any MRF move records whose sources got overwritten. */ 1646 if (inst->dst.file == GRF) { 1647 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 1648 if (last_mrf_move[i] && 1649 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 1650 last_mrf_move[i] = NULL; 1651 } 1652 } 1653 } 1654 1655 if (inst->opcode == BRW_OPCODE_MOV && 1656 inst->dst.file == MRF && 1657 inst->src[0].file == GRF && 1658 !inst->predicated) { 1659 last_mrf_move[inst->dst.reg] = inst; 1660 } 1661 } 1662 1663 return progress; 1664} 1665 1666/** 1667 * Possibly returns an instruction that set up @param reg. 1668 * 1669 * Sometimes we want to take the result of some expression/variable 1670 * dereference tree and rewrite the instruction generating the result 1671 * of the tree. When processing the tree, we know that the 1672 * instructions generated are all writing temporaries that are dead 1673 * outside of this tree. So, if we have some instructions that write 1674 * a temporary, we're free to point that temp write somewhere else. 1675 * 1676 * Note that this doesn't guarantee that the instruction generated 1677 * only reg -- it might be the size=4 destination of a texture instruction. 1678 */ 1679fs_inst * 1680fs_visitor::get_instruction_generating_reg(fs_inst *start, 1681 fs_inst *end, 1682 fs_reg reg) 1683{ 1684 if (end == start || 1685 end->predicated || 1686 end->force_uncompressed || 1687 end->force_sechalf || 1688 !reg.equals(end->dst)) { 1689 return NULL; 1690 } else { 1691 return end; 1692 } 1693} 1694 1695bool 1696fs_visitor::run() 1697{ 1698 uint32_t prog_offset_16 = 0; 1699 uint32_t orig_nr_params = c->prog_data.nr_params; 1700 1701 brw_wm_payload_setup(brw, c); 1702 1703 if (c->dispatch_width == 16) { 1704 /* align to 64 byte boundary. */ 1705 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { 1706 brw_NOP(p); 1707 } 1708 1709 /* Save off the start of this 16-wide program in case we succeed. */ 1710 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); 1711 1712 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 1713 } 1714 1715 if (0) { 1716 emit_dummy_fs(); 1717 } else { 1718 calculate_urb_setup(); 1719 if (intel->gen < 6) 1720 emit_interpolation_setup_gen4(); 1721 else 1722 emit_interpolation_setup_gen6(); 1723 1724 /* Generate FS IR for main(). (the visitor only descends into 1725 * functions called "main"). 1726 */ 1727 foreach_list(node, &*shader->ir) { 1728 ir_instruction *ir = (ir_instruction *)node; 1729 base_ir = ir; 1730 this->result = reg_undef; 1731 ir->accept(this); 1732 } 1733 if (failed) 1734 return false; 1735 1736 emit_fb_writes(); 1737 1738 split_virtual_grfs(); 1739 1740 setup_paramvalues_refs(); 1741 setup_pull_constants(); 1742 1743 bool progress; 1744 do { 1745 progress = false; 1746 1747 progress = remove_duplicate_mrf_writes() || progress; 1748 1749 progress = propagate_constants() || progress; 1750 progress = opt_algebraic() || progress; 1751 progress = opt_cse() || progress; 1752 progress = opt_copy_propagate() || progress; 1753 progress = register_coalesce() || progress; 1754 progress = register_coalesce_2() || progress; 1755 progress = compute_to_mrf() || progress; 1756 progress = dead_code_eliminate() || progress; 1757 } while (progress); 1758 1759 remove_dead_constants(); 1760 1761 schedule_instructions(); 1762 1763 assign_curb_setup(); 1764 assign_urb_setup(); 1765 1766 if (0) { 1767 /* Debug of register spilling: Go spill everything. */ 1768 int virtual_grf_count = virtual_grf_next; 1769 for (int i = 0; i < virtual_grf_count; i++) { 1770 spill_reg(i); 1771 } 1772 } 1773 1774 if (0) 1775 assign_regs_trivial(); 1776 else { 1777 while (!assign_regs()) { 1778 if (failed) 1779 break; 1780 } 1781 } 1782 } 1783 assert(force_uncompressed_stack == 0); 1784 assert(force_sechalf_stack == 0); 1785 1786 if (failed) 1787 return false; 1788 1789 generate_code(); 1790 1791 if (c->dispatch_width == 8) { 1792 c->prog_data.reg_blocks = brw_register_blocks(grf_used); 1793 } else { 1794 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used); 1795 c->prog_data.prog_offset_16 = prog_offset_16; 1796 1797 /* Make sure we didn't try to sneak in an extra uniform */ 1798 assert(orig_nr_params == c->prog_data.nr_params); 1799 (void) orig_nr_params; 1800 } 1801 1802 return !failed; 1803} 1804 1805bool 1806brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, 1807 struct gl_shader_program *prog) 1808{ 1809 struct intel_context *intel = &brw->intel; 1810 1811 if (!prog) 1812 return false; 1813 1814 struct brw_shader *shader = 1815 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 1816 if (!shader) 1817 return false; 1818 1819 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 1820 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 1821 _mesa_print_ir(shader->ir, NULL); 1822 printf("\n\n"); 1823 } 1824 1825 /* Now the main event: Visit the shader IR and generate our FS IR for it. 1826 */ 1827 c->dispatch_width = 8; 1828 1829 fs_visitor v(c, prog, shader); 1830 if (!v.run()) { 1831 prog->LinkStatus = false; 1832 ralloc_strcat(&prog->InfoLog, v.fail_msg); 1833 1834 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", 1835 v.fail_msg); 1836 1837 return false; 1838 } 1839 1840 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) { 1841 c->dispatch_width = 16; 1842 fs_visitor v2(c, prog, shader); 1843 v2.import_uniforms(&v); 1844 v2.run(); 1845 } 1846 1847 c->prog_data.dispatch_width = 8; 1848 1849 return true; 1850} 1851 1852bool 1853brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) 1854{ 1855 struct brw_context *brw = brw_context(ctx); 1856 struct brw_wm_prog_key key; 1857 1858 /* As a temporary measure we assume that all programs use dFdy() (and hence 1859 * need to be compiled differently depending on whether we're rendering to 1860 * an FBO). FIXME: set this bool correctly based on the contents of the 1861 * program. 1862 */ 1863 bool program_uses_dfdy = true; 1864 1865 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT]) 1866 return true; 1867 1868 struct gl_fragment_program *fp = (struct gl_fragment_program *) 1869 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program; 1870 struct brw_fragment_program *bfp = brw_fragment_program(fp); 1871 1872 memset(&key, 0, sizeof(key)); 1873 1874 if (fp->UsesKill) 1875 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; 1876 1877 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 1878 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; 1879 1880 /* Just assume depth testing. */ 1881 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; 1882 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; 1883 1884 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS); 1885 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) { 1886 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i))) 1887 continue; 1888 1889 key.proj_attrib_mask |= 1 << i; 1890 1891 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i); 1892 1893 if (vp_index >= 0) 1894 key.vp_outputs_written |= BITFIELD64_BIT(vp_index); 1895 } 1896 1897 key.clamp_fragment_color = true; 1898 1899 for (int i = 0; i < BRW_MAX_TEX_UNIT; i++) { 1900 if (fp->Base.ShadowSamplers & (1 << i)) 1901 key.tex.compare_funcs[i] = GL_LESS; 1902 1903 /* FINISHME: depth compares might use (0,0,0,W) for example */ 1904 key.tex.swizzles[i] = SWIZZLE_XYZW; 1905 } 1906 1907 if (fp->Base.InputsRead & FRAG_BIT_WPOS) { 1908 key.drawable_height = ctx->DrawBuffer->Height; 1909 } 1910 1911 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) { 1912 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); 1913 } 1914 1915 key.nr_color_regions = 1; 1916 1917 key.program_string_id = bfp->id; 1918 1919 uint32_t old_prog_offset = brw->wm.prog_offset; 1920 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; 1921 1922 bool success = do_wm_prog(brw, prog, bfp, &key); 1923 1924 brw->wm.prog_offset = old_prog_offset; 1925 brw->wm.prog_data = old_prog_data; 1926 1927 return success; 1928} 1929