brw_fs.cpp revision 89d81ab16c05818b290ed735c1343d3abde449bf
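Scope: the GLSL IR to i965 fragment-shader backend: link-time IR lowering, the fs_visitor translation of GLSL IR into FS IR (uniforms, interpolation, texturing, control flow), framebuffer writes, and per-instruction code-generation helpers.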
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28extern "C" { 29 30#include <sys/types.h> 31 32#include "main/macros.h" 33#include "main/shaderobj.h" 34#include "main/uniforms.h" 35#include "program/prog_parameter.h" 36#include "program/prog_print.h" 37#include "program/prog_optimize.h" 38#include "program/register_allocate.h" 39#include "program/sampler.h" 40#include "program/hash_table.h" 41#include "brw_context.h" 42#include "brw_eu.h" 43#include "brw_wm.h" 44} 45#include "brw_fs.h" 46#include "../glsl/glsl_types.h" 47#include "../glsl/ir_optimization.h" 48#include "../glsl/ir_print_visitor.h" 49 50#define MAX_INSTRUCTION (1 << 30) 51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); 52 53struct gl_shader * 54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) 55{ 56 struct brw_shader *shader; 57 58 shader = rzalloc(NULL, struct brw_shader); 59 if (shader) { 60 shader->base.Type = type; 61 shader->base.Name = name; 62 _mesa_init_shader(ctx, &shader->base); 63 } 64 65 return &shader->base; 66} 67 68struct gl_shader_program * 69brw_new_shader_program(struct gl_context *ctx, GLuint name) 70{ 71 struct brw_shader_program *prog; 72 prog = rzalloc(NULL, struct brw_shader_program); 73 if (prog) { 74 prog->base.Name = name; 75 _mesa_init_shader_program(ctx, &prog->base); 76 } 77 return &prog->base; 78} 79 80GLboolean 81brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) 82{ 83 struct brw_context *brw = brw_context(ctx); 84 struct intel_context *intel = &brw->intel; 85 86 struct brw_shader *shader = 87 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 88 if (shader != NULL) { 89 void *mem_ctx = ralloc_context(NULL); 90 bool progress; 91 92 if (shader->ir) 93 ralloc_free(shader->ir); 94 shader->ir = new(shader) exec_list; 95 clone_ir_list(mem_ctx, shader->ir, shader->base.ir); 96 97 do_mat_op_to_vec(shader->ir); 98 lower_instructions(shader->ir, 99 MOD_TO_FRACT | 100 DIV_TO_MUL_RCP | 101 SUB_TO_ADD_NEG | 102 EXP_TO_EXP2 | 103 LOG_TO_LOG2); 104 105 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, 106 * if-statements need to be flattened. 
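 * lower_if_to_cond_assign() rewrites those deeper branches as conditional assignments.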
107 */ 108 if (intel->gen < 6) 109 lower_if_to_cond_assign(shader->ir, 16); 110 111 do_lower_texture_projection(shader->ir); 112 do_vec_index_to_cond_assign(shader->ir); 113 brw_do_cubemap_normalize(shader->ir); 114 lower_noise(shader->ir); 115 lower_quadop_vector(shader->ir, false); 116 lower_variable_index_to_cond_assign(shader->ir, 117 GL_TRUE, /* input */ 118 GL_TRUE, /* output */ 119 GL_TRUE, /* temp */ 120 GL_TRUE /* uniform */ 121 ); 122 123 do { 124 progress = false; 125 126 brw_do_channel_expressions(shader->ir); 127 brw_do_vector_splitting(shader->ir); 128 129 progress = do_lower_jumps(shader->ir, true, true, 130 true, /* main return */ 131 false, /* continue */ 132 false /* loops */ 133 ) || progress; 134 135 progress = do_common_optimization(shader->ir, true, 32) || progress; 136 } while (progress); 137 138 validate_ir_tree(shader->ir); 139 140 reparent_ir(shader->ir, shader->ir); 141 ralloc_free(mem_ctx); 142 } 143 144 if (!_mesa_ir_link_shader(ctx, prog)) 145 return GL_FALSE; 146 147 return GL_TRUE; 148} 149 150static int 151type_size(const struct glsl_type *type) 152{ 153 unsigned int size, i; 154 155 switch (type->base_type) { 156 case GLSL_TYPE_UINT: 157 case GLSL_TYPE_INT: 158 case GLSL_TYPE_FLOAT: 159 case GLSL_TYPE_BOOL: 160 return type->components(); 161 case GLSL_TYPE_ARRAY: 162 return type_size(type->fields.array) * type->length; 163 case GLSL_TYPE_STRUCT: 164 size = 0; 165 for (i = 0; i < type->length; i++) { 166 size += type_size(type->fields.structure[i].type); 167 } 168 return size; 169 case GLSL_TYPE_SAMPLER: 170 /* Samplers take up no register space, since they're baked in at 171 * link time. 172 */ 173 return 0; 174 default: 175 assert(!"not reached"); 176 return 0; 177 } 178} 179 180void 181fs_visitor::fail(const char *format, ...) 182{ 183 if (!failed) { 184 failed = true; 185 186 if (INTEL_DEBUG & DEBUG_WM) { 187 fprintf(stderr, "FS compile failed: "); 188 189 va_list va; 190 va_start(va, format); 191 vfprintf(stderr, format, va); 192 va_end(va); 193 } 194 } 195} 196 197/** 198 * Returns how many MRFs an FS opcode will write over. 199 * 200 * Note that this is not the 0 or 1 implied writes in an actual gen 201 * instruction -- the FS opcodes often generate MOVs in addition. 202 */ 203int 204fs_visitor::implied_mrf_writes(fs_inst *inst) 205{ 206 if (inst->mlen == 0) 207 return 0; 208 209 switch (inst->opcode) { 210 case FS_OPCODE_RCP: 211 case FS_OPCODE_RSQ: 212 case FS_OPCODE_SQRT: 213 case FS_OPCODE_EXP2: 214 case FS_OPCODE_LOG2: 215 case FS_OPCODE_SIN: 216 case FS_OPCODE_COS: 217 return 1; 218 case FS_OPCODE_POW: 219 return 2; 220 case FS_OPCODE_TEX: 221 case FS_OPCODE_TXB: 222 case FS_OPCODE_TXD: 223 case FS_OPCODE_TXL: 224 return 1; 225 case FS_OPCODE_FB_WRITE: 226 return 2; 227 case FS_OPCODE_PULL_CONSTANT_LOAD: 228 case FS_OPCODE_UNSPILL: 229 return 1; 230 case FS_OPCODE_SPILL: 231 return 2; 232 default: 233 assert(!"not reached"); 234 return inst->mlen; 235 } 236} 237 238int 239fs_visitor::virtual_grf_alloc(int size) 240{ 241 if (virtual_grf_array_size <= virtual_grf_next) { 242 if (virtual_grf_array_size == 0) 243 virtual_grf_array_size = 16; 244 else 245 virtual_grf_array_size *= 2; 246 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 247 virtual_grf_array_size); 248 249 /* This slot is always unused. */ 250 virtual_grf_sizes[0] = 0; 251 } 252 virtual_grf_sizes[virtual_grf_next] = size; 253 return virtual_grf_next++; 254} 255 256/** Fixed HW reg constructor. 
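 * Takes the register file and hardware register number; the type defaults to float.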
*/ 257fs_reg::fs_reg(enum register_file file, int hw_reg) 258{ 259 init(); 260 this->file = file; 261 this->hw_reg = hw_reg; 262 this->type = BRW_REGISTER_TYPE_F; 263} 264 265/** Fixed HW reg constructor. */ 266fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 267{ 268 init(); 269 this->file = file; 270 this->hw_reg = hw_reg; 271 this->type = type; 272} 273 274int 275brw_type_for_base_type(const struct glsl_type *type) 276{ 277 switch (type->base_type) { 278 case GLSL_TYPE_FLOAT: 279 return BRW_REGISTER_TYPE_F; 280 case GLSL_TYPE_INT: 281 case GLSL_TYPE_BOOL: 282 return BRW_REGISTER_TYPE_D; 283 case GLSL_TYPE_UINT: 284 return BRW_REGISTER_TYPE_UD; 285 case GLSL_TYPE_ARRAY: 286 case GLSL_TYPE_STRUCT: 287 case GLSL_TYPE_SAMPLER: 288 /* These should be overridden with the type of the member when 289 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely 290 * way to trip up if we don't. 291 */ 292 return BRW_REGISTER_TYPE_UD; 293 default: 294 assert(!"not reached"); 295 return BRW_REGISTER_TYPE_F; 296 } 297} 298 299/** Automatic reg constructor. */ 300fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 301{ 302 init(); 303 304 this->file = GRF; 305 this->reg = v->virtual_grf_alloc(type_size(type)); 306 this->reg_offset = 0; 307 this->type = brw_type_for_base_type(type); 308} 309 310fs_reg * 311fs_visitor::variable_storage(ir_variable *var) 312{ 313 return (fs_reg *)hash_table_find(this->variable_ht, var); 314} 315 316/* Our support for uniforms is piggy-backed on the struct 317 * gl_fragment_program, because that's where the values actually 318 * get stored, rather than in some global gl_shader_program uniform 319 * store. 320 */ 321int 322fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 323{ 324 unsigned int offset = 0; 325 326 if (type->is_matrix()) { 327 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 328 type->vector_elements, 329 1); 330 331 for (unsigned int i = 0; i < type->matrix_columns; i++) { 332 offset += setup_uniform_values(loc + offset, column); 333 } 334 335 return offset; 336 } 337 338 switch (type->base_type) { 339 case GLSL_TYPE_FLOAT: 340 case GLSL_TYPE_UINT: 341 case GLSL_TYPE_INT: 342 case GLSL_TYPE_BOOL: 343 for (unsigned int i = 0; i < type->vector_elements; i++) { 344 unsigned int param = c->prog_data.nr_params++; 345 346 assert(param < ARRAY_SIZE(c->prog_data.param)); 347 348 switch (type->base_type) { 349 case GLSL_TYPE_FLOAT: 350 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 351 break; 352 case GLSL_TYPE_UINT: 353 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 354 break; 355 case GLSL_TYPE_INT: 356 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 357 break; 358 case GLSL_TYPE_BOOL: 359 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 360 break; 361 default: 362 assert(!"not reached"); 363 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 364 break; 365 } 366 this->param_index[param] = loc; 367 this->param_offset[param] = i; 368 } 369 return 1; 370 371 case GLSL_TYPE_STRUCT: 372 for (unsigned int i = 0; i < type->length; i++) { 373 offset += setup_uniform_values(loc + offset, 374 type->fields.structure[i].type); 375 } 376 return offset; 377 378 case GLSL_TYPE_ARRAY: 379 for (unsigned int i = 0; i < type->length; i++) { 380 offset += setup_uniform_values(loc + offset, type->fields.array); 381 } 382 return offset; 383 384 case GLSL_TYPE_SAMPLER: 385 /* The sampler takes up a slot, but we don't use any values from it. 
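 * (The sampler unit itself is baked in at link time, as noted in type_size().)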
*/ 386 return 1; 387 388 default: 389 assert(!"not reached"); 390 return 0; 391 } 392} 393 394 395/* Our support for builtin uniforms is even scarier than non-builtin. 396 * It sits on top of the PROG_STATE_VAR parameters that are 397 * automatically updated from GL context state. 398 */ 399void 400fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 401{ 402 const ir_state_slot *const slots = ir->state_slots; 403 assert(ir->state_slots != NULL); 404 405 { 406 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 407 /* This state reference has already been setup by ir_to_mesa, 408 * but we'll get the same index back here. 409 */ 410 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 411 (gl_state_index *)slots[i].tokens); 412 413 /* Add each of the unique swizzles of the element as a 414 * parameter. This'll end up matching the expected layout of 415 * the array/matrix/structure we're trying to fill in. 416 */ 417 int last_swiz = -1; 418 for (unsigned int j = 0; j < 4; j++) { 419 int swiz = GET_SWZ(slots[i].swizzle, j); 420 if (swiz == last_swiz) 421 break; 422 last_swiz = swiz; 423 424 c->prog_data.param_convert[c->prog_data.nr_params] = 425 PARAM_NO_CONVERT; 426 this->param_index[c->prog_data.nr_params] = index; 427 this->param_offset[c->prog_data.nr_params] = swiz; 428 c->prog_data.nr_params++; 429 } 430 } 431 } 432} 433 434fs_reg * 435fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 436{ 437 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 438 fs_reg wpos = *reg; 439 fs_reg neg_y = this->pixel_y; 440 neg_y.negate = true; 441 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 442 443 /* gl_FragCoord.x */ 444 if (ir->pixel_center_integer) { 445 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 446 } else { 447 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 448 } 449 wpos.reg_offset++; 450 451 /* gl_FragCoord.y */ 452 if (!flip && ir->pixel_center_integer) { 453 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 454 } else { 455 fs_reg pixel_y = this->pixel_y; 456 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 457 458 if (flip) { 459 pixel_y.negate = true; 460 offset += c->key.drawable_height - 1.0; 461 } 462 463 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 464 } 465 wpos.reg_offset++; 466 467 /* gl_FragCoord.z */ 468 if (intel->gen >= 6) { 469 emit(BRW_OPCODE_MOV, wpos, 470 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 471 } else { 472 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 473 interp_reg(FRAG_ATTRIB_WPOS, 2)); 474 } 475 wpos.reg_offset++; 476 477 /* gl_FragCoord.w: Already set up in emit_interpolation */ 478 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 479 480 return reg; 481} 482 483fs_reg * 484fs_visitor::emit_general_interpolation(ir_variable *ir) 485{ 486 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 487 /* Interpolation is always in floating point regs. 
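 * The result type is forced to F regardless of the varying's declared GLSL type.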
*/ 488 reg->type = BRW_REGISTER_TYPE_F; 489 fs_reg attr = *reg; 490 491 unsigned int array_elements; 492 const glsl_type *type; 493 494 if (ir->type->is_array()) { 495 array_elements = ir->type->length; 496 if (array_elements == 0) { 497 fail("dereferenced array '%s' has length 0\n", ir->name); 498 } 499 type = ir->type->fields.array; 500 } else { 501 array_elements = 1; 502 type = ir->type; 503 } 504 505 int location = ir->location; 506 for (unsigned int i = 0; i < array_elements; i++) { 507 for (unsigned int j = 0; j < type->matrix_columns; j++) { 508 if (urb_setup[location] == -1) { 509 /* If there's no incoming setup data for this slot, don't 510 * emit interpolation for it. 511 */ 512 attr.reg_offset += type->vector_elements; 513 location++; 514 continue; 515 } 516 517 bool is_gl_Color = 518 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 519 520 if (c->key.flat_shade && is_gl_Color) { 521 /* Constant interpolation (flat shading) case. The SF has 522 * handed us defined values in only the constant offset 523 * field of the setup reg. 524 */ 525 for (unsigned int k = 0; k < type->vector_elements; k++) { 526 struct brw_reg interp = interp_reg(location, k); 527 interp = suboffset(interp, 3); 528 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 529 attr.reg_offset++; 530 } 531 } else { 532 /* Perspective interpolation case. */ 533 for (unsigned int k = 0; k < type->vector_elements; k++) { 534 struct brw_reg interp = interp_reg(location, k); 535 emit(FS_OPCODE_LINTERP, attr, 536 this->delta_x, this->delta_y, fs_reg(interp)); 537 attr.reg_offset++; 538 } 539 540 if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) { 541 attr.reg_offset -= type->vector_elements; 542 for (unsigned int k = 0; k < type->vector_elements; k++) { 543 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 544 attr.reg_offset++; 545 } 546 } 547 } 548 location++; 549 } 550 } 551 552 return reg; 553} 554 555fs_reg * 556fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 557{ 558 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 559 560 /* The frontfacing comes in as a bit in the thread payload. */ 561 if (intel->gen >= 6) { 562 emit(BRW_OPCODE_ASR, *reg, 563 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 564 fs_reg(15)); 565 emit(BRW_OPCODE_NOT, *reg, *reg); 566 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 567 } else { 568 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 569 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 570 * us front face 571 */ 572 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 573 fs_reg(r1_6ud), 574 fs_reg(1u << 31)); 575 inst->conditional_mod = BRW_CONDITIONAL_L; 576 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 577 } 578 579 return reg; 580} 581 582fs_inst * 583fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 584{ 585 switch (opcode) { 586 case FS_OPCODE_RCP: 587 case FS_OPCODE_RSQ: 588 case FS_OPCODE_SQRT: 589 case FS_OPCODE_EXP2: 590 case FS_OPCODE_LOG2: 591 case FS_OPCODE_SIN: 592 case FS_OPCODE_COS: 593 break; 594 default: 595 assert(!"not reached: bad math opcode"); 596 return NULL; 597 } 598 599 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 600 * might be able to do better by doing execsize = 1 math and then 601 * expanding that result out, but we would need to be careful with 602 * masking. 603 * 604 * The hardware ignores source modifiers (negate and abs) on math 605 * instructions, so we also move to a temp to set those up. 
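 * (The MOV into the temporary applies the modifier, so the math instruction sees a plain GRF source.)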
606 */ 607 if (intel->gen >= 6 && (src.file == UNIFORM || 608 src.abs || 609 src.negate)) { 610 fs_reg expanded = fs_reg(this, glsl_type::float_type); 611 emit(BRW_OPCODE_MOV, expanded, src); 612 src = expanded; 613 } 614 615 fs_inst *inst = emit(opcode, dst, src); 616 617 if (intel->gen < 6) { 618 inst->base_mrf = 2; 619 inst->mlen = 1; 620 } 621 622 return inst; 623} 624 625fs_inst * 626fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 627{ 628 int base_mrf = 2; 629 fs_inst *inst; 630 631 assert(opcode == FS_OPCODE_POW); 632 633 if (intel->gen >= 6) { 634 /* Can't do hstride == 0 args to gen6 math, so expand it out. 635 * 636 * The hardware ignores source modifiers (negate and abs) on math 637 * instructions, so we also move to a temp to set those up. 638 */ 639 if (src0.file == UNIFORM || src0.abs || src0.negate) { 640 fs_reg expanded = fs_reg(this, glsl_type::float_type); 641 emit(BRW_OPCODE_MOV, expanded, src0); 642 src0 = expanded; 643 } 644 645 if (src1.file == UNIFORM || src1.abs || src1.negate) { 646 fs_reg expanded = fs_reg(this, glsl_type::float_type); 647 emit(BRW_OPCODE_MOV, expanded, src1); 648 src1 = expanded; 649 } 650 651 inst = emit(opcode, dst, src0, src1); 652 } else { 653 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 654 inst = emit(opcode, dst, src0, reg_null_f); 655 656 inst->base_mrf = base_mrf; 657 inst->mlen = 2; 658 } 659 return inst; 660} 661 662void 663fs_visitor::visit(ir_variable *ir) 664{ 665 fs_reg *reg = NULL; 666 667 if (variable_storage(ir)) 668 return; 669 670 if (strcmp(ir->name, "gl_FragColor") == 0) { 671 this->frag_color = ir; 672 } else if (strcmp(ir->name, "gl_FragData") == 0) { 673 this->frag_data = ir; 674 } else if (strcmp(ir->name, "gl_FragDepth") == 0) { 675 this->frag_depth = ir; 676 } 677 678 if (ir->mode == ir_var_in) { 679 if (!strcmp(ir->name, "gl_FragCoord")) { 680 reg = emit_fragcoord_interpolation(ir); 681 } else if (!strcmp(ir->name, "gl_FrontFacing")) { 682 reg = emit_frontfacing_interpolation(ir); 683 } else { 684 reg = emit_general_interpolation(ir); 685 } 686 assert(reg); 687 hash_table_insert(this->variable_ht, reg, ir); 688 return; 689 } 690 691 if (ir->mode == ir_var_uniform) { 692 int param_index = c->prog_data.nr_params; 693 694 if (!strncmp(ir->name, "gl_", 3)) { 695 setup_builtin_uniform_values(ir); 696 } else { 697 setup_uniform_values(ir->location, ir->type); 698 } 699 700 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); 701 reg->type = brw_type_for_base_type(ir->type); 702 } 703 704 if (!reg) 705 reg = new(this->mem_ctx) fs_reg(this, ir->type); 706 707 hash_table_insert(this->variable_ht, reg, ir); 708} 709 710void 711fs_visitor::visit(ir_dereference_variable *ir) 712{ 713 fs_reg *reg = variable_storage(ir->var); 714 this->result = *reg; 715} 716 717void 718fs_visitor::visit(ir_dereference_record *ir) 719{ 720 const glsl_type *struct_type = ir->record->type; 721 722 ir->record->accept(this); 723 724 unsigned int offset = 0; 725 for (unsigned int i = 0; i < struct_type->length; i++) { 726 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) 727 break; 728 offset += type_size(struct_type->fields.structure[i].type); 729 } 730 this->result.reg_offset += offset; 731 this->result.type = brw_type_for_base_type(ir->type); 732} 733 734void 735fs_visitor::visit(ir_dereference_array *ir) 736{ 737 ir_constant *index; 738 int element_size; 739 740 ir->array->accept(this); 741 index = ir->array_index->as_constant(); 742 743 element_size = type_size(ir->type); 744 
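 /* Constant indices are folded into reg_offset below; variable indices were lowered to conditional assignments earlier by lower_variable_index_to_cond_assign(). */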
this->result.type = brw_type_for_base_type(ir->type); 745 746 if (index) { 747 assert(this->result.file == UNIFORM || 748 (this->result.file == GRF && 749 this->result.reg != 0)); 750 this->result.reg_offset += index->value.i[0] * element_size; 751 } else { 752 assert(!"FINISHME: non-constant array element"); 753 } 754} 755 756/* Instruction selection: Produce a MOV.sat instead of 757 * MIN(MAX(val, 0), 1) when possible. 758 */ 759bool 760fs_visitor::try_emit_saturate(ir_expression *ir) 761{ 762 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 763 764 if (!sat_val) 765 return false; 766 767 sat_val->accept(this); 768 fs_reg src = this->result; 769 770 this->result = fs_reg(this, ir->type); 771 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src); 772 inst->saturate = true; 773 774 return true; 775} 776 777static uint32_t 778brw_conditional_for_comparison(unsigned int op) 779{ 780 switch (op) { 781 case ir_binop_less: 782 return BRW_CONDITIONAL_L; 783 case ir_binop_greater: 784 return BRW_CONDITIONAL_G; 785 case ir_binop_lequal: 786 return BRW_CONDITIONAL_LE; 787 case ir_binop_gequal: 788 return BRW_CONDITIONAL_GE; 789 case ir_binop_equal: 790 case ir_binop_all_equal: /* same as equal for scalars */ 791 return BRW_CONDITIONAL_Z; 792 case ir_binop_nequal: 793 case ir_binop_any_nequal: /* same as nequal for scalars */ 794 return BRW_CONDITIONAL_NZ; 795 default: 796 assert(!"not reached: bad operation for comparison"); 797 return BRW_CONDITIONAL_NZ; 798 } 799} 800 801void 802fs_visitor::visit(ir_expression *ir) 803{ 804 unsigned int operand; 805 fs_reg op[2], temp; 806 fs_inst *inst; 807 808 assert(ir->get_num_operands() <= 2); 809 810 if (try_emit_saturate(ir)) 811 return; 812 813 for (operand = 0; operand < ir->get_num_operands(); operand++) { 814 ir->operands[operand]->accept(this); 815 if (this->result.file == BAD_FILE) { 816 ir_print_visitor v; 817 fail("Failed to get tree for expression operand:\n"); 818 ir->operands[operand]->accept(&v); 819 } 820 op[operand] = this->result; 821 822 /* Matrix expression operands should have been broken down to vector 823 * operations already. 824 */ 825 assert(!ir->operands[operand]->type->is_matrix()); 826 /* And then those vector operands should have been broken down to scalar. 827 */ 828 assert(!ir->operands[operand]->type->is_vector()); 829 } 830 831 /* Storage for our result. If our result goes into an assignment, it will 832 * just get copy-propagated out, so no worries. 833 */ 834 this->result = fs_reg(this, ir->type); 835 836 switch (ir->operation) { 837 case ir_unop_logic_not: 838 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 839 * ones complement of the whole register, not just bit 0. 
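 * XOR with 1 flips just the low bit, which is how booleans are stored here.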
840 */ 841 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)); 842 break; 843 case ir_unop_neg: 844 op[0].negate = !op[0].negate; 845 this->result = op[0]; 846 break; 847 case ir_unop_abs: 848 op[0].abs = true; 849 op[0].negate = false; 850 this->result = op[0]; 851 break; 852 case ir_unop_sign: 853 temp = fs_reg(this, ir->type); 854 855 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)); 856 857 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 858 inst->conditional_mod = BRW_CONDITIONAL_G; 859 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)); 860 inst->predicated = true; 861 862 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 863 inst->conditional_mod = BRW_CONDITIONAL_L; 864 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)); 865 inst->predicated = true; 866 867 break; 868 case ir_unop_rcp: 869 emit_math(FS_OPCODE_RCP, this->result, op[0]); 870 break; 871 872 case ir_unop_exp2: 873 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 874 break; 875 case ir_unop_log2: 876 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 877 break; 878 case ir_unop_exp: 879 case ir_unop_log: 880 assert(!"not reached: should be handled by ir_explog_to_explog2"); 881 break; 882 case ir_unop_sin: 883 case ir_unop_sin_reduced: 884 emit_math(FS_OPCODE_SIN, this->result, op[0]); 885 break; 886 case ir_unop_cos: 887 case ir_unop_cos_reduced: 888 emit_math(FS_OPCODE_COS, this->result, op[0]); 889 break; 890 891 case ir_unop_dFdx: 892 emit(FS_OPCODE_DDX, this->result, op[0]); 893 break; 894 case ir_unop_dFdy: 895 emit(FS_OPCODE_DDY, this->result, op[0]); 896 break; 897 898 case ir_binop_add: 899 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]); 900 break; 901 case ir_binop_sub: 902 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 903 break; 904 905 case ir_binop_mul: 906 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]); 907 break; 908 case ir_binop_div: 909 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 910 break; 911 case ir_binop_mod: 912 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 913 break; 914 915 case ir_binop_less: 916 case ir_binop_greater: 917 case ir_binop_lequal: 918 case ir_binop_gequal: 919 case ir_binop_equal: 920 case ir_binop_all_equal: 921 case ir_binop_nequal: 922 case ir_binop_any_nequal: 923 temp = this->result; 924 /* original gen4 does implicit conversion before comparison. 
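 * so the CMP destination type has to match the source type on those parts.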
*/ 925 if (intel->gen < 5) 926 temp.type = op[0].type; 927 928 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); 929 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 930 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)); 931 break; 932 933 case ir_binop_logic_xor: 934 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 935 break; 936 937 case ir_binop_logic_or: 938 emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 939 break; 940 941 case ir_binop_logic_and: 942 emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 943 break; 944 945 case ir_binop_dot: 946 case ir_unop_any: 947 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 948 break; 949 950 case ir_unop_noise: 951 assert(!"not reached: should be handled by lower_noise"); 952 break; 953 954 case ir_quadop_vector: 955 assert(!"not reached: should be handled by lower_quadop_vector"); 956 break; 957 958 case ir_unop_sqrt: 959 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 960 break; 961 962 case ir_unop_rsq: 963 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 964 break; 965 966 case ir_unop_i2f: 967 case ir_unop_b2f: 968 case ir_unop_b2i: 969 case ir_unop_f2i: 970 emit(BRW_OPCODE_MOV, this->result, op[0]); 971 break; 972 case ir_unop_f2b: 973 case ir_unop_i2b: 974 temp = this->result; 975 /* original gen4 does implicit conversion before comparison. */ 976 if (intel->gen < 5) 977 temp.type = op[0].type; 978 979 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)); 980 inst->conditional_mod = BRW_CONDITIONAL_NZ; 981 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1)); 982 break; 983 984 case ir_unop_trunc: 985 emit(BRW_OPCODE_RNDZ, this->result, op[0]); 986 break; 987 case ir_unop_ceil: 988 op[0].negate = !op[0].negate; 989 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 990 this->result.negate = true; 991 break; 992 case ir_unop_floor: 993 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 994 break; 995 case ir_unop_fract: 996 inst = emit(BRW_OPCODE_FRC, this->result, op[0]); 997 break; 998 case ir_unop_round_even: 999 emit(BRW_OPCODE_RNDE, this->result, op[0]); 1000 break; 1001 1002 case ir_binop_min: 1003 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1004 inst->conditional_mod = BRW_CONDITIONAL_L; 1005 1006 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1007 inst->predicated = true; 1008 break; 1009 case ir_binop_max: 1010 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1011 inst->conditional_mod = BRW_CONDITIONAL_G; 1012 1013 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1014 inst->predicated = true; 1015 break; 1016 1017 case ir_binop_pow: 1018 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1019 break; 1020 1021 case ir_unop_bit_not: 1022 inst = emit(BRW_OPCODE_NOT, this->result, op[0]); 1023 break; 1024 case ir_binop_bit_and: 1025 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 1026 break; 1027 case ir_binop_bit_xor: 1028 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 1029 break; 1030 case ir_binop_bit_or: 1031 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 1032 break; 1033 1034 case ir_unop_u2f: 1035 case ir_binop_lshift: 1036 case ir_binop_rshift: 1037 assert(!"GLSL 1.30 features unsupported"); 1038 break; 1039 } 1040} 1041 1042void 1043fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1044 const glsl_type *type, bool predicated) 1045{ 1046 switch (type->base_type) { 1047 case GLSL_TYPE_FLOAT: 1048 case GLSL_TYPE_UINT: 1049 case GLSL_TYPE_INT: 1050 case GLSL_TYPE_BOOL: 1051 for 
(unsigned int i = 0; i < type->components(); i++) { 1052 l.type = brw_type_for_base_type(type); 1053 r.type = brw_type_for_base_type(type); 1054 1055 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r); 1056 inst->predicated = predicated; 1057 1058 l.reg_offset++; 1059 r.reg_offset++; 1060 } 1061 break; 1062 case GLSL_TYPE_ARRAY: 1063 for (unsigned int i = 0; i < type->length; i++) { 1064 emit_assignment_writes(l, r, type->fields.array, predicated); 1065 } 1066 break; 1067 1068 case GLSL_TYPE_STRUCT: 1069 for (unsigned int i = 0; i < type->length; i++) { 1070 emit_assignment_writes(l, r, type->fields.structure[i].type, 1071 predicated); 1072 } 1073 break; 1074 1075 case GLSL_TYPE_SAMPLER: 1076 break; 1077 1078 default: 1079 assert(!"not reached"); 1080 break; 1081 } 1082} 1083 1084void 1085fs_visitor::visit(ir_assignment *ir) 1086{ 1087 struct fs_reg l, r; 1088 fs_inst *inst; 1089 1090 /* FINISHME: arrays on the lhs */ 1091 ir->lhs->accept(this); 1092 l = this->result; 1093 1094 ir->rhs->accept(this); 1095 r = this->result; 1096 1097 assert(l.file != BAD_FILE); 1098 assert(r.file != BAD_FILE); 1099 1100 if (ir->condition) { 1101 emit_bool_to_cond_code(ir->condition); 1102 } 1103 1104 if (ir->lhs->type->is_scalar() || 1105 ir->lhs->type->is_vector()) { 1106 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1107 if (ir->write_mask & (1 << i)) { 1108 inst = emit(BRW_OPCODE_MOV, l, r); 1109 if (ir->condition) 1110 inst->predicated = true; 1111 r.reg_offset++; 1112 } 1113 l.reg_offset++; 1114 } 1115 } else { 1116 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1117 } 1118} 1119 1120fs_inst * 1121fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1122{ 1123 int mlen; 1124 int base_mrf = 1; 1125 bool simd16 = false; 1126 fs_reg orig_dst; 1127 1128 /* g0 header. */ 1129 mlen = 1; 1130 1131 if (ir->shadow_comparitor) { 1132 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1133 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1134 coordinate.reg_offset++; 1135 } 1136 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1137 mlen += 3; 1138 1139 if (ir->op == ir_tex) { 1140 /* There's no plain shadow compare message, so we use shadow 1141 * compare with a bias of 0.0. 1142 */ 1143 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); 1144 mlen++; 1145 } else if (ir->op == ir_txb) { 1146 ir->lod_info.bias->accept(this); 1147 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1148 mlen++; 1149 } else { 1150 assert(ir->op == ir_txl); 1151 ir->lod_info.lod->accept(this); 1152 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1153 mlen++; 1154 } 1155 1156 ir->shadow_comparitor->accept(this); 1157 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1158 mlen++; 1159 } else if (ir->op == ir_tex) { 1160 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1161 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1162 coordinate.reg_offset++; 1163 } 1164 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1165 mlen += 3; 1166 } else if (ir->op == ir_txd) { 1167 assert(!"TXD isn't supported on gen4 yet."); 1168 } else { 1169 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1170 * instructions. We'll need to do SIMD16 here. 
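 * (Coordinates are written to every other MRF slot, and the odd return channels are dropped again below.)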
1171 */ 1172 assert(ir->op == ir_txb || ir->op == ir_txl); 1173 1174 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1175 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate); 1176 coordinate.reg_offset++; 1177 } 1178 1179 /* lod/bias appears after u/v/r. */ 1180 mlen += 6; 1181 1182 if (ir->op == ir_txb) { 1183 ir->lod_info.bias->accept(this); 1184 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1185 mlen++; 1186 } else { 1187 ir->lod_info.lod->accept(this); 1188 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1189 mlen++; 1190 } 1191 1192 /* The unused upper half. */ 1193 mlen++; 1194 1195 /* Now, since we're doing simd16, the return is 2 interleaved 1196 * vec4s where the odd-indexed ones are junk. We'll need to move 1197 * this weirdness around to the expected layout. 1198 */ 1199 simd16 = true; 1200 orig_dst = dst; 1201 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1202 2)); 1203 dst.type = BRW_REGISTER_TYPE_F; 1204 } 1205 1206 fs_inst *inst = NULL; 1207 switch (ir->op) { 1208 case ir_tex: 1209 inst = emit(FS_OPCODE_TEX, dst); 1210 break; 1211 case ir_txb: 1212 inst = emit(FS_OPCODE_TXB, dst); 1213 break; 1214 case ir_txl: 1215 inst = emit(FS_OPCODE_TXL, dst); 1216 break; 1217 case ir_txd: 1218 inst = emit(FS_OPCODE_TXD, dst); 1219 break; 1220 case ir_txf: 1221 assert(!"GLSL 1.30 features unsupported"); 1222 break; 1223 } 1224 inst->base_mrf = base_mrf; 1225 inst->mlen = mlen; 1226 1227 if (simd16) { 1228 for (int i = 0; i < 4; i++) { 1229 emit(BRW_OPCODE_MOV, orig_dst, dst); 1230 orig_dst.reg_offset++; 1231 dst.reg_offset += 2; 1232 } 1233 } 1234 1235 return inst; 1236} 1237 1238fs_inst * 1239fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1240{ 1241 /* gen5's SIMD8 sampler has slots for u, v, r, array index, then 1242 * optional parameters like shadow comparitor or LOD bias. If 1243 * optional parameters aren't present, those base slots are 1244 * optional and don't need to be included in the message. 1245 * 1246 * We don't fill in the unnecessary slots regardless, which may 1247 * look surprising in the disassembly. 1248 */ 1249 int mlen = 1; /* g0 header always present. 
*/ 1250 int base_mrf = 1; 1251 1252 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1253 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1254 coordinate.reg_offset++; 1255 } 1256 mlen += ir->coordinate->type->vector_elements; 1257 1258 if (ir->shadow_comparitor) { 1259 mlen = MAX2(mlen, 5); 1260 1261 ir->shadow_comparitor->accept(this); 1262 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1263 mlen++; 1264 } 1265 1266 fs_inst *inst = NULL; 1267 switch (ir->op) { 1268 case ir_tex: 1269 inst = emit(FS_OPCODE_TEX, dst); 1270 break; 1271 case ir_txb: 1272 ir->lod_info.bias->accept(this); 1273 mlen = MAX2(mlen, 5); 1274 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1275 mlen++; 1276 1277 inst = emit(FS_OPCODE_TXB, dst); 1278 break; 1279 case ir_txl: 1280 ir->lod_info.lod->accept(this); 1281 mlen = MAX2(mlen, 5); 1282 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1283 mlen++; 1284 1285 inst = emit(FS_OPCODE_TXL, dst); 1286 break; 1287 case ir_txd: 1288 case ir_txf: 1289 assert(!"GLSL 1.30 features unsupported"); 1290 break; 1291 } 1292 inst->base_mrf = base_mrf; 1293 inst->mlen = mlen; 1294 1295 return inst; 1296} 1297 1298void 1299fs_visitor::visit(ir_texture *ir) 1300{ 1301 int sampler; 1302 fs_inst *inst = NULL; 1303 1304 ir->coordinate->accept(this); 1305 fs_reg coordinate = this->result; 1306 1307 if (ir->offset != NULL) { 1308 ir_constant *offset = ir->offset->as_constant(); 1309 assert(offset != NULL); 1310 1311 signed char offsets[3]; 1312 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) 1313 offsets[i] = (signed char) offset->value.i[i]; 1314 1315 /* Combine all three offsets into a single unsigned dword: 1316 * 1317 * bits 11:8 - U Offset (X component) 1318 * bits 7:4 - V Offset (Y component) 1319 * bits 3:0 - R Offset (Z component) 1320 */ 1321 unsigned offset_bits = 0; 1322 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { 1323 const unsigned shift = 4 * (2 - i); 1324 offset_bits |= (offsets[i] << shift) & (0xF << shift); 1325 } 1326 1327 /* Explicitly set up the message header by copying g0 to msg reg m1. */ 1328 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), 1329 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD)); 1330 1331 /* Then set the offset bits in DWord 2 of the message header. */ 1332 emit(BRW_OPCODE_MOV, 1333 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), 1334 BRW_REGISTER_TYPE_UD)), 1335 fs_reg(brw_imm_uw(offset_bits))); 1336 } 1337 1338 /* Should be lowered by do_lower_texture_projection */ 1339 assert(!ir->projector); 1340 1341 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1342 ctx->Shader.CurrentFragmentProgram, 1343 &brw->fragment_program->Base); 1344 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1345 1346 /* The 965 requires the EU to do the normalization of GL rectangle 1347 * texture coordinates. We use the program parameter state 1348 * tracking to get the scaling factor. 
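 * (The STATE_TEXRECT_SCALE parameters provide the per-axis factors applied by the two MULs below.)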
1349 */ 1350 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1351 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1352 int tokens[STATE_LENGTH] = { 1353 STATE_INTERNAL, 1354 STATE_TEXRECT_SCALE, 1355 sampler, 1356 0, 1357 0 1358 }; 1359 1360 c->prog_data.param_convert[c->prog_data.nr_params] = 1361 PARAM_NO_CONVERT; 1362 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1363 PARAM_NO_CONVERT; 1364 1365 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1366 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1367 GLuint index = _mesa_add_state_reference(params, 1368 (gl_state_index *)tokens); 1369 1370 this->param_index[c->prog_data.nr_params] = index; 1371 this->param_offset[c->prog_data.nr_params] = 0; 1372 c->prog_data.nr_params++; 1373 this->param_index[c->prog_data.nr_params] = index; 1374 this->param_offset[c->prog_data.nr_params] = 1; 1375 c->prog_data.nr_params++; 1376 1377 fs_reg dst = fs_reg(this, ir->coordinate->type); 1378 fs_reg src = coordinate; 1379 coordinate = dst; 1380 1381 emit(BRW_OPCODE_MUL, dst, src, scale_x); 1382 dst.reg_offset++; 1383 src.reg_offset++; 1384 emit(BRW_OPCODE_MUL, dst, src, scale_y); 1385 } 1386 1387 /* Writemasking doesn't eliminate channels on SIMD8 texture 1388 * samples, so don't worry about them. 1389 */ 1390 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1391 1392 if (intel->gen < 5) { 1393 inst = emit_texture_gen4(ir, dst, coordinate); 1394 } else { 1395 inst = emit_texture_gen5(ir, dst, coordinate); 1396 } 1397 1398 /* If there's an offset, we already set up m1. To avoid the implied move, 1399 * use the null register. Otherwise, we want an implied move from g0. 1400 */ 1401 if (ir->offset != NULL) 1402 inst->src[0] = fs_reg(brw_null_reg()); 1403 else 1404 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); 1405 1406 inst->sampler = sampler; 1407 1408 this->result = dst; 1409 1410 if (ir->shadow_comparitor) 1411 inst->shadow_compare = true; 1412 1413 if (ir->type == glsl_type::float_type) { 1414 /* Ignore DEPTH_TEXTURE_MODE swizzling. 
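 * (A float-typed result only comes from a shadow sampler here, which the assert below checks.)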
*/ 1415 assert(ir->sampler->type->sampler_shadow); 1416 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1417 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1418 1419 for (int i = 0; i < 4; i++) { 1420 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1421 fs_reg l = swizzle_dst; 1422 l.reg_offset += i; 1423 1424 if (swiz == SWIZZLE_ZERO) { 1425 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f)); 1426 } else if (swiz == SWIZZLE_ONE) { 1427 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f)); 1428 } else { 1429 fs_reg r = dst; 1430 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1431 emit(BRW_OPCODE_MOV, l, r); 1432 } 1433 } 1434 this->result = swizzle_dst; 1435 } 1436} 1437 1438void 1439fs_visitor::visit(ir_swizzle *ir) 1440{ 1441 ir->val->accept(this); 1442 fs_reg val = this->result; 1443 1444 if (ir->type->vector_elements == 1) { 1445 this->result.reg_offset += ir->mask.x; 1446 return; 1447 } 1448 1449 fs_reg result = fs_reg(this, ir->type); 1450 this->result = result; 1451 1452 for (unsigned int i = 0; i < ir->type->vector_elements; i++) { 1453 fs_reg channel = val; 1454 int swiz = 0; 1455 1456 switch (i) { 1457 case 0: 1458 swiz = ir->mask.x; 1459 break; 1460 case 1: 1461 swiz = ir->mask.y; 1462 break; 1463 case 2: 1464 swiz = ir->mask.z; 1465 break; 1466 case 3: 1467 swiz = ir->mask.w; 1468 break; 1469 } 1470 1471 channel.reg_offset += swiz; 1472 emit(BRW_OPCODE_MOV, result, channel); 1473 result.reg_offset++; 1474 } 1475} 1476 1477void 1478fs_visitor::visit(ir_discard *ir) 1479{ 1480 fs_reg temp = fs_reg(this, glsl_type::uint_type); 1481 1482 assert(ir->condition == NULL); /* FINISHME */ 1483 1484 emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d); 1485 emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp); 1486 kill_emitted = true; 1487} 1488 1489void 1490fs_visitor::visit(ir_constant *ir) 1491{ 1492 /* Set this->result to reg at the bottom of the function because some code 1493 * paths will cause this visitor to be applied to other fields. This will 1494 * cause the value stored in this->result to be modified. 1495 * 1496 * Make reg constant so that it doesn't get accidentally modified along the 1497 * way. Yes, I actually had this problem. 
:( 1498 */ 1499 const fs_reg reg(this, ir->type); 1500 fs_reg dst_reg = reg; 1501 1502 if (ir->type->is_array()) { 1503 const unsigned size = type_size(ir->type->fields.array); 1504 1505 for (unsigned i = 0; i < ir->type->length; i++) { 1506 ir->array_elements[i]->accept(this); 1507 fs_reg src_reg = this->result; 1508 1509 dst_reg.type = src_reg.type; 1510 for (unsigned j = 0; j < size; j++) { 1511 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1512 src_reg.reg_offset++; 1513 dst_reg.reg_offset++; 1514 } 1515 } 1516 } else if (ir->type->is_record()) { 1517 foreach_list(node, &ir->components) { 1518 ir_instruction *const field = (ir_instruction *) node; 1519 const unsigned size = type_size(field->type); 1520 1521 field->accept(this); 1522 fs_reg src_reg = this->result; 1523 1524 dst_reg.type = src_reg.type; 1525 for (unsigned j = 0; j < size; j++) { 1526 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1527 src_reg.reg_offset++; 1528 dst_reg.reg_offset++; 1529 } 1530 } 1531 } else { 1532 const unsigned size = type_size(ir->type); 1533 1534 for (unsigned i = 0; i < size; i++) { 1535 switch (ir->type->base_type) { 1536 case GLSL_TYPE_FLOAT: 1537 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])); 1538 break; 1539 case GLSL_TYPE_UINT: 1540 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])); 1541 break; 1542 case GLSL_TYPE_INT: 1543 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])); 1544 break; 1545 case GLSL_TYPE_BOOL: 1546 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])); 1547 break; 1548 default: 1549 assert(!"Non-float/uint/int/bool constant"); 1550 } 1551 dst_reg.reg_offset++; 1552 } 1553 } 1554 1555 this->result = reg; 1556} 1557 1558void 1559fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1560{ 1561 ir_expression *expr = ir->as_expression(); 1562 1563 if (expr) { 1564 fs_reg op[2]; 1565 fs_inst *inst; 1566 1567 assert(expr->get_num_operands() <= 2); 1568 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1569 assert(expr->operands[i]->type->is_scalar()); 1570 1571 expr->operands[i]->accept(this); 1572 op[i] = this->result; 1573 } 1574 1575 switch (expr->operation) { 1576 case ir_unop_logic_not: 1577 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)); 1578 inst->conditional_mod = BRW_CONDITIONAL_Z; 1579 break; 1580 1581 case ir_binop_logic_xor: 1582 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]); 1583 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1584 break; 1585 1586 case ir_binop_logic_or: 1587 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]); 1588 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1589 break; 1590 1591 case ir_binop_logic_and: 1592 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]); 1593 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1594 break; 1595 1596 case ir_unop_f2b: 1597 if (intel->gen >= 6) { 1598 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f)); 1599 } else { 1600 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]); 1601 } 1602 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1603 break; 1604 1605 case ir_unop_i2b: 1606 if (intel->gen >= 6) { 1607 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)); 1608 } else { 1609 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]); 1610 } 1611 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1612 break; 1613 1614 case ir_binop_greater: 1615 case ir_binop_gequal: 1616 case ir_binop_less: 1617 case ir_binop_lequal: 1618 case ir_binop_equal: 1619 case ir_binop_all_equal: 1620 case ir_binop_nequal: 1621 case ir_binop_any_nequal: 1622 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]); 1623 
inst->conditional_mod = 1624 brw_conditional_for_comparison(expr->operation); 1625 break; 1626 1627 default: 1628 assert(!"not reached"); 1629 fail("bad cond code\n"); 1630 break; 1631 } 1632 return; 1633 } 1634 1635 ir->accept(this); 1636 1637 if (intel->gen >= 6) { 1638 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1)); 1639 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1640 } else { 1641 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result); 1642 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1643 } 1644} 1645 1646/** 1647 * Emit a gen6 IF statement with the comparison folded into the IF 1648 * instruction. 1649 */ 1650void 1651fs_visitor::emit_if_gen6(ir_if *ir) 1652{ 1653 ir_expression *expr = ir->condition->as_expression(); 1654 1655 if (expr) { 1656 fs_reg op[2]; 1657 fs_inst *inst; 1658 fs_reg temp; 1659 1660 assert(expr->get_num_operands() <= 2); 1661 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1662 assert(expr->operands[i]->type->is_scalar()); 1663 1664 expr->operands[i]->accept(this); 1665 op[i] = this->result; 1666 } 1667 1668 switch (expr->operation) { 1669 case ir_unop_logic_not: 1670 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0)); 1671 inst->conditional_mod = BRW_CONDITIONAL_Z; 1672 return; 1673 1674 case ir_binop_logic_xor: 1675 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1676 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1677 return; 1678 1679 case ir_binop_logic_or: 1680 temp = fs_reg(this, glsl_type::bool_type); 1681 emit(BRW_OPCODE_OR, temp, op[0], op[1]); 1682 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1683 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1684 return; 1685 1686 case ir_binop_logic_and: 1687 temp = fs_reg(this, glsl_type::bool_type); 1688 emit(BRW_OPCODE_AND, temp, op[0], op[1]); 1689 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1690 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1691 return; 1692 1693 case ir_unop_f2b: 1694 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)); 1695 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1696 return; 1697 1698 case ir_unop_i2b: 1699 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1700 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1701 return; 1702 1703 case ir_binop_greater: 1704 case ir_binop_gequal: 1705 case ir_binop_less: 1706 case ir_binop_lequal: 1707 case ir_binop_equal: 1708 case ir_binop_all_equal: 1709 case ir_binop_nequal: 1710 case ir_binop_any_nequal: 1711 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1712 inst->conditional_mod = 1713 brw_conditional_for_comparison(expr->operation); 1714 return; 1715 default: 1716 assert(!"not reached"); 1717 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1718 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1719 fail("bad condition\n"); 1720 return; 1721 } 1722 return; 1723 } 1724 1725 ir->condition->accept(this); 1726 1727 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)); 1728 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1729} 1730 1731void 1732fs_visitor::visit(ir_if *ir) 1733{ 1734 fs_inst *inst; 1735 1736 /* Don't point the annotation at the if statement, because then it plus 1737 * the then and else blocks get printed. 
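 * Pointing base_ir at just the condition keeps the generated-code annotations concise.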
1738 */ 1739 this->base_ir = ir->condition; 1740 1741 if (intel->gen >= 6) { 1742 emit_if_gen6(ir); 1743 } else { 1744 emit_bool_to_cond_code(ir->condition); 1745 1746 inst = emit(BRW_OPCODE_IF); 1747 inst->predicated = true; 1748 } 1749 1750 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1751 ir_instruction *ir = (ir_instruction *)iter.get(); 1752 this->base_ir = ir; 1753 1754 ir->accept(this); 1755 } 1756 1757 if (!ir->else_instructions.is_empty()) { 1758 emit(BRW_OPCODE_ELSE); 1759 1760 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1761 ir_instruction *ir = (ir_instruction *)iter.get(); 1762 this->base_ir = ir; 1763 1764 ir->accept(this); 1765 } 1766 } 1767 1768 emit(BRW_OPCODE_ENDIF); 1769} 1770 1771void 1772fs_visitor::visit(ir_loop *ir) 1773{ 1774 fs_reg counter = reg_undef; 1775 1776 if (ir->counter) { 1777 this->base_ir = ir->counter; 1778 ir->counter->accept(this); 1779 counter = *(variable_storage(ir->counter)); 1780 1781 if (ir->from) { 1782 this->base_ir = ir->from; 1783 ir->from->accept(this); 1784 1785 emit(BRW_OPCODE_MOV, counter, this->result); 1786 } 1787 } 1788 1789 emit(BRW_OPCODE_DO); 1790 1791 if (ir->to) { 1792 this->base_ir = ir->to; 1793 ir->to->accept(this); 1794 1795 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result); 1796 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); 1797 1798 inst = emit(BRW_OPCODE_BREAK); 1799 inst->predicated = true; 1800 } 1801 1802 foreach_iter(exec_list_iterator, iter, ir->body_instructions) { 1803 ir_instruction *ir = (ir_instruction *)iter.get(); 1804 1805 this->base_ir = ir; 1806 ir->accept(this); 1807 } 1808 1809 if (ir->increment) { 1810 this->base_ir = ir->increment; 1811 ir->increment->accept(this); 1812 emit(BRW_OPCODE_ADD, counter, counter, this->result); 1813 } 1814 1815 emit(BRW_OPCODE_WHILE); 1816} 1817 1818void 1819fs_visitor::visit(ir_loop_jump *ir) 1820{ 1821 switch (ir->mode) { 1822 case ir_loop_jump::jump_break: 1823 emit(BRW_OPCODE_BREAK); 1824 break; 1825 case ir_loop_jump::jump_continue: 1826 emit(BRW_OPCODE_CONTINUE); 1827 break; 1828 } 1829} 1830 1831void 1832fs_visitor::visit(ir_call *ir) 1833{ 1834 assert(!"FINISHME"); 1835} 1836 1837void 1838fs_visitor::visit(ir_return *ir) 1839{ 1840 assert(!"FINISHME"); 1841} 1842 1843void 1844fs_visitor::visit(ir_function *ir) 1845{ 1846 /* Ignore function bodies other than main() -- we shouldn't see calls to 1847 * them since they should all be inlined before we get to ir_to_mesa. 1848 */ 1849 if (strcmp(ir->name, "main") == 0) { 1850 const ir_function_signature *sig; 1851 exec_list empty; 1852 1853 sig = ir->matching_signature(&empty); 1854 1855 assert(sig); 1856 1857 foreach_iter(exec_list_iterator, iter, sig->body) { 1858 ir_instruction *ir = (ir_instruction *)iter.get(); 1859 this->base_ir = ir; 1860 1861 ir->accept(this); 1862 } 1863 } 1864} 1865 1866void 1867fs_visitor::visit(ir_function_signature *ir) 1868{ 1869 assert(!"not reached"); 1870 (void)ir; 1871} 1872 1873fs_inst * 1874fs_visitor::emit(fs_inst inst) 1875{ 1876 fs_inst *list_inst = new(mem_ctx) fs_inst; 1877 *list_inst = inst; 1878 1879 list_inst->annotation = this->current_annotation; 1880 list_inst->ir = this->base_ir; 1881 1882 this->instructions.push_tail(list_inst); 1883 1884 return list_inst; 1885} 1886 1887/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 1888void 1889fs_visitor::emit_dummy_fs() 1890{ 1891 /* Everyone's favorite color. 
*/ 1892 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f)); 1893 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f)); 1894 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f)); 1895 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f)); 1896 1897 fs_inst *write; 1898 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); 1899 write->base_mrf = 0; 1900} 1901 1902/* The register location here is relative to the start of the URB 1903 * data. It will get adjusted to be a real location before 1904 * generate_code() time. 1905 */ 1906struct brw_reg 1907fs_visitor::interp_reg(int location, int channel) 1908{ 1909 int regnr = urb_setup[location] * 2 + channel / 2; 1910 int stride = (channel & 1) * 4; 1911 1912 assert(urb_setup[location] != -1); 1913 1914 return brw_vec1_grf(regnr, stride); 1915} 1916 1917/** Emits the interpolation for the varying inputs. */ 1918void 1919fs_visitor::emit_interpolation_setup_gen4() 1920{ 1921 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 1922 1923 this->current_annotation = "compute pixel centers"; 1924 this->pixel_x = fs_reg(this, glsl_type::uint_type); 1925 this->pixel_y = fs_reg(this, glsl_type::uint_type); 1926 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 1927 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 1928 emit(BRW_OPCODE_ADD, 1929 this->pixel_x, 1930 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 1931 fs_reg(brw_imm_v(0x10101010))); 1932 emit(BRW_OPCODE_ADD, 1933 this->pixel_y, 1934 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 1935 fs_reg(brw_imm_v(0x11001100))); 1936 1937 this->current_annotation = "compute pixel deltas from v0"; 1938 if (brw->has_pln) { 1939 this->delta_x = fs_reg(this, glsl_type::vec2_type); 1940 this->delta_y = this->delta_x; 1941 this->delta_y.reg_offset++; 1942 } else { 1943 this->delta_x = fs_reg(this, glsl_type::float_type); 1944 this->delta_y = fs_reg(this, glsl_type::float_type); 1945 } 1946 emit(BRW_OPCODE_ADD, this->delta_x, 1947 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))); 1948 emit(BRW_OPCODE_ADD, this->delta_y, 1949 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))); 1950 1951 this->current_annotation = "compute pos.w and 1/pos.w"; 1952 /* Compute wpos.w. It's always in our setup, since it's needed to 1953 * interpolate the other attributes. 1954 */ 1955 this->wpos_w = fs_reg(this, glsl_type::float_type); 1956 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y, 1957 interp_reg(FRAG_ATTRIB_WPOS, 3)); 1958 /* Compute the pixel 1/W value from wpos.w. */ 1959 this->pixel_w = fs_reg(this, glsl_type::float_type); 1960 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 1961 this->current_annotation = NULL; 1962} 1963 1964/** Emits the interpolation for the varying inputs. */ 1965void 1966fs_visitor::emit_interpolation_setup_gen6() 1967{ 1968 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 1969 1970 /* If the pixel centers end up used, the setup is the same as for gen4. */ 1971 this->current_annotation = "compute pixel centers"; 1972 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); 1973 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); 1974 int_pixel_x.type = BRW_REGISTER_TYPE_UW; 1975 int_pixel_y.type = BRW_REGISTER_TYPE_UW; 1976 emit(BRW_OPCODE_ADD, 1977 int_pixel_x, 1978 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 1979 fs_reg(brw_imm_v(0x10101010))); 1980 emit(BRW_OPCODE_ADD, 1981 int_pixel_y, 1982 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 1983 fs_reg(brw_imm_v(0x11001100))); 1984 1985 /* As of gen6, we can no longer mix float and int sources. 
We have 1986 * to turn the integer pixel centers into floats for their actual 1987 * use. 1988 */ 1989 this->pixel_x = fs_reg(this, glsl_type::float_type); 1990 this->pixel_y = fs_reg(this, glsl_type::float_type); 1991 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x); 1992 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y); 1993 1994 this->current_annotation = "compute 1/pos.w"; 1995 this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); 1996 this->pixel_w = fs_reg(this, glsl_type::float_type); 1997 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 1998 1999 this->delta_x = fs_reg(brw_vec8_grf(2, 0)); 2000 this->delta_y = fs_reg(brw_vec8_grf(3, 0)); 2001 2002 this->current_annotation = NULL; 2003} 2004 2005void 2006fs_visitor::emit_fb_writes() 2007{ 2008 this->current_annotation = "FB write header"; 2009 GLboolean header_present = GL_TRUE; 2010 int nr = 0; 2011 2012 if (intel->gen >= 6 && 2013 !this->kill_emitted && 2014 c->key.nr_color_regions == 1) { 2015 header_present = false; 2016 } 2017 2018 if (header_present) { 2019 /* m0, m1 header */ 2020 nr += 2; 2021 } 2022 2023 if (c->aa_dest_stencil_reg) { 2024 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2025 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))); 2026 } 2027 2028 /* Reserve space for color. It'll be filled in per MRT below. */ 2029 int color_mrf = nr; 2030 nr += 4; 2031 2032 if (c->source_depth_to_render_target) { 2033 if (c->computes_depth) { 2034 /* Hand over gl_FragDepth. */ 2035 assert(this->frag_depth); 2036 fs_reg depth = *(variable_storage(this->frag_depth)); 2037 2038 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth); 2039 } else { 2040 /* Pass through the payload depth. */ 2041 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2042 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 2043 } 2044 } 2045 2046 if (c->dest_depth_reg) { 2047 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2048 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))); 2049 } 2050 2051 fs_reg color = reg_undef; 2052 if (this->frag_color) 2053 color = *(variable_storage(this->frag_color)); 2054 else if (this->frag_data) { 2055 color = *(variable_storage(this->frag_data)); 2056 color.type = BRW_REGISTER_TYPE_F; 2057 } 2058 2059 for (int target = 0; target < c->key.nr_color_regions; target++) { 2060 this->current_annotation = ralloc_asprintf(this->mem_ctx, 2061 "FB write target %d", 2062 target); 2063 if (this->frag_color || this->frag_data) { 2064 for (int i = 0; i < 4; i++) { 2065 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color); 2066 color.reg_offset++; 2067 } 2068 } 2069 2070 if (this->frag_color) 2071 color.reg_offset -= 4; 2072 2073 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2074 inst->target = target; 2075 inst->base_mrf = 0; 2076 inst->mlen = nr; 2077 if (target == c->key.nr_color_regions - 1) 2078 inst->eot = true; 2079 inst->header_present = header_present; 2080 } 2081 2082 if (c->key.nr_color_regions == 0) { 2083 if (c->key.alpha_test && (this->frag_color || this->frag_data)) { 2084 /* If the alpha test is enabled but there's no color buffer, 2085 * we still need to send alpha out the pipeline to our null 2086 * renderbuffer. 
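 * (Only the alpha channel, at color.reg_offset + 3, is copied into the message.)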
2087 */ 2088 color.reg_offset += 3; 2089 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color); 2090 } 2091 2092 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2093 inst->base_mrf = 0; 2094 inst->mlen = nr; 2095 inst->eot = true; 2096 inst->header_present = header_present; 2097 } 2098 2099 this->current_annotation = NULL; 2100} 2101 2102void 2103fs_visitor::generate_fb_write(fs_inst *inst) 2104{ 2105 GLboolean eot = inst->eot; 2106 struct brw_reg implied_header; 2107 2108 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied 2109 * move, here's g1. 2110 */ 2111 brw_push_insn_state(p); 2112 brw_set_mask_control(p, BRW_MASK_DISABLE); 2113 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2114 2115 if (inst->header_present) { 2116 if (intel->gen >= 6) { 2117 brw_MOV(p, 2118 brw_message_reg(inst->base_mrf), 2119 brw_vec8_grf(0, 0)); 2120 2121 if (inst->target > 0) { 2122 /* Set the render target index for choosing BLEND_STATE. */ 2123 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2124 BRW_REGISTER_TYPE_UD), 2125 brw_imm_ud(inst->target)); 2126 } 2127 2128 /* Clear viewport index, render target array index. */ 2129 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0), 2130 BRW_REGISTER_TYPE_UD), 2131 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2132 brw_imm_ud(0xf7ff)); 2133 2134 implied_header = brw_null_reg(); 2135 } else { 2136 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2137 } 2138 2139 brw_MOV(p, 2140 brw_message_reg(inst->base_mrf + 1), 2141 brw_vec8_grf(1, 0)); 2142 } else { 2143 implied_header = brw_null_reg(); 2144 } 2145 2146 brw_pop_insn_state(p); 2147 2148 brw_fb_WRITE(p, 2149 8, /* dispatch_width */ 2150 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), 2151 inst->base_mrf, 2152 implied_header, 2153 inst->target, 2154 inst->mlen, 2155 0, 2156 eot, 2157 inst->header_present); 2158} 2159 2160void 2161fs_visitor::generate_linterp(fs_inst *inst, 2162 struct brw_reg dst, struct brw_reg *src) 2163{ 2164 struct brw_reg delta_x = src[0]; 2165 struct brw_reg delta_y = src[1]; 2166 struct brw_reg interp = src[2]; 2167 2168 if (brw->has_pln && 2169 delta_y.nr == delta_x.nr + 1 && 2170 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2171 brw_PLN(p, dst, interp, delta_x); 2172 } else { 2173 brw_LINE(p, brw_null_reg(), interp, delta_x); 2174 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2175 } 2176} 2177 2178void 2179fs_visitor::generate_math(fs_inst *inst, 2180 struct brw_reg dst, struct brw_reg *src) 2181{ 2182 int op; 2183 2184 switch (inst->opcode) { 2185 case FS_OPCODE_RCP: 2186 op = BRW_MATH_FUNCTION_INV; 2187 break; 2188 case FS_OPCODE_RSQ: 2189 op = BRW_MATH_FUNCTION_RSQ; 2190 break; 2191 case FS_OPCODE_SQRT: 2192 op = BRW_MATH_FUNCTION_SQRT; 2193 break; 2194 case FS_OPCODE_EXP2: 2195 op = BRW_MATH_FUNCTION_EXP; 2196 break; 2197 case FS_OPCODE_LOG2: 2198 op = BRW_MATH_FUNCTION_LOG; 2199 break; 2200 case FS_OPCODE_POW: 2201 op = BRW_MATH_FUNCTION_POW; 2202 break; 2203 case FS_OPCODE_SIN: 2204 op = BRW_MATH_FUNCTION_SIN; 2205 break; 2206 case FS_OPCODE_COS: 2207 op = BRW_MATH_FUNCTION_COS; 2208 break; 2209 default: 2210 assert(!"not reached: unknown math function"); 2211 op = 0; 2212 break; 2213 } 2214 2215 if (intel->gen >= 6) { 2216 assert(inst->mlen == 0); 2217 2218 if (inst->opcode == FS_OPCODE_POW) { 2219 brw_math2(p, dst, op, src[0], src[1]); 2220 } else { 2221 brw_math(p, dst, 2222 op, 2223 inst->saturate ? 
BRW_MATH_SATURATE_SATURATE : 2224 BRW_MATH_SATURATE_NONE, 2225 0, src[0], 2226 BRW_MATH_DATA_VECTOR, 2227 BRW_MATH_PRECISION_FULL); 2228 } 2229 } else { 2230 assert(inst->mlen >= 1); 2231 2232 brw_math(p, dst, 2233 op, 2234 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2235 BRW_MATH_SATURATE_NONE, 2236 inst->base_mrf, src[0], 2237 BRW_MATH_DATA_VECTOR, 2238 BRW_MATH_PRECISION_FULL); 2239 } 2240} 2241 2242void 2243fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2244{ 2245 int msg_type = -1; 2246 int rlen = 4; 2247 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2248 2249 if (intel->gen >= 5) { 2250 switch (inst->opcode) { 2251 case FS_OPCODE_TEX: 2252 if (inst->shadow_compare) { 2253 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 2254 } else { 2255 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 2256 } 2257 break; 2258 case FS_OPCODE_TXB: 2259 if (inst->shadow_compare) { 2260 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 2261 } else { 2262 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 2263 } 2264 break; 2265 case FS_OPCODE_TXL: 2266 if (inst->shadow_compare) { 2267 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 2268 } else { 2269 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 2270 } 2271 break; 2272 case FS_OPCODE_TXD: 2273 assert(!"TXD isn't supported on gen5+ yet."); 2274 break; 2275 } 2276 } else { 2277 switch (inst->opcode) { 2278 case FS_OPCODE_TEX: 2279 /* Note that G45 and older determines shadow compare and dispatch width 2280 * from message length for most messages. 2281 */ 2282 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2283 if (inst->shadow_compare) { 2284 assert(inst->mlen == 6); 2285 } else { 2286 assert(inst->mlen <= 4); 2287 } 2288 break; 2289 case FS_OPCODE_TXB: 2290 if (inst->shadow_compare) { 2291 assert(inst->mlen == 6); 2292 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 2293 } else { 2294 assert(inst->mlen == 9); 2295 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2296 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2297 } 2298 break; 2299 case FS_OPCODE_TXL: 2300 if (inst->shadow_compare) { 2301 assert(inst->mlen == 6); 2302 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 2303 } else { 2304 assert(inst->mlen == 9); 2305 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 2306 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2307 } 2308 break; 2309 case FS_OPCODE_TXD: 2310 assert(!"TXD isn't supported on gen4 yet."); 2311 break; 2312 } 2313 } 2314 assert(msg_type != -1); 2315 2316 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 2317 rlen = 8; 2318 dst = vec16(dst); 2319 } 2320 2321 brw_SAMPLE(p, 2322 retype(dst, BRW_REGISTER_TYPE_UW), 2323 inst->base_mrf, 2324 src, 2325 SURF_INDEX_TEXTURE(inst->sampler), 2326 inst->sampler, 2327 WRITEMASK_XYZW, 2328 msg_type, 2329 rlen, 2330 inst->mlen, 2331 0, 2332 1, 2333 simd_mode); 2334} 2335 2336 2337/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2338 * looking like: 2339 * 2340 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2341 * 2342 * and we're trying to produce: 2343 * 2344 * DDX DDY 2345 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2346 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2347 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2348 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2349 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2350 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2351 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2352 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2353 * 2354 * and add another set of two more subspans if in 16-pixel dispatch mode. 
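* (ssN above is the Nth 2x2 subspan of pixels being shaded; tl/tr/bl/br
* are its top-left, top-right, bottom-left and bottom-right pixels.)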
2355 * 2356 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2357 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2358 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2359 * between each other. We could probably do it like ddx and swizzle the right 2360 * order later, but bail for now and just produce 2361 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2362 */ 2363void 2364fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2365{ 2366 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2367 BRW_REGISTER_TYPE_F, 2368 BRW_VERTICAL_STRIDE_2, 2369 BRW_WIDTH_2, 2370 BRW_HORIZONTAL_STRIDE_0, 2371 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2372 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2373 BRW_REGISTER_TYPE_F, 2374 BRW_VERTICAL_STRIDE_2, 2375 BRW_WIDTH_2, 2376 BRW_HORIZONTAL_STRIDE_0, 2377 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2378 brw_ADD(p, dst, src0, negate(src1)); 2379} 2380 2381void 2382fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2383{ 2384 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2385 BRW_REGISTER_TYPE_F, 2386 BRW_VERTICAL_STRIDE_4, 2387 BRW_WIDTH_4, 2388 BRW_HORIZONTAL_STRIDE_0, 2389 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2390 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2391 BRW_REGISTER_TYPE_F, 2392 BRW_VERTICAL_STRIDE_4, 2393 BRW_WIDTH_4, 2394 BRW_HORIZONTAL_STRIDE_0, 2395 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2396 brw_ADD(p, dst, src0, negate(src1)); 2397} 2398 2399void 2400fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask) 2401{ 2402 if (intel->gen >= 6) { 2403 /* Gen6 no longer has the mask reg for us to just read the 2404 * active channels from. However, cmp updates just the channels 2405 * of the flag reg that are enabled, so we can get at the 2406 * channel enables that way. In this step, make a reg of ones 2407 * we'll compare to. 
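* generate_discard_and() below then compares the mask built up here
* against zero and ANDs the resulting flag value into g1, which is how
* the discarded channels get dropped from the pixel enables.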
2408 */ 2409 brw_MOV(p, mask, brw_imm_ud(1)); 2410 } else { 2411 brw_push_insn_state(p); 2412 brw_set_mask_control(p, BRW_MASK_DISABLE); 2413 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */ 2414 brw_pop_insn_state(p); 2415 } 2416} 2417 2418void 2419fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) 2420{ 2421 if (intel->gen >= 6) { 2422 struct brw_reg f0 = brw_flag_reg(); 2423 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2424 2425 brw_push_insn_state(p); 2426 brw_set_mask_control(p, BRW_MASK_DISABLE); 2427 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */ 2428 brw_pop_insn_state(p); 2429 2430 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2431 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */ 2432 /* Undo CMP's whacking of predication*/ 2433 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2434 2435 brw_push_insn_state(p); 2436 brw_set_mask_control(p, BRW_MASK_DISABLE); 2437 brw_AND(p, g1, f0, g1); 2438 brw_pop_insn_state(p); 2439 } else { 2440 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2441 2442 mask = brw_uw1_reg(mask.file, mask.nr, 0); 2443 2444 brw_push_insn_state(p); 2445 brw_set_mask_control(p, BRW_MASK_DISABLE); 2446 brw_AND(p, g0, mask, g0); 2447 brw_pop_insn_state(p); 2448 } 2449} 2450 2451void 2452fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2453{ 2454 assert(inst->mlen != 0); 2455 2456 brw_MOV(p, 2457 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2458 retype(src, BRW_REGISTER_TYPE_UD)); 2459 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2460 inst->offset); 2461} 2462 2463void 2464fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2465{ 2466 assert(inst->mlen != 0); 2467 2468 /* Clear any post destination dependencies that would be ignored by 2469 * the block read. See the B-Spec for pre-gen5 send instruction. 2470 * 2471 * This could use a better solution, since texture sampling and 2472 * math reads could potentially run into it as well -- anywhere 2473 * that we have a SEND with a destination that is a register that 2474 * was written but not read within the last N instructions (what's 2475 * N? unsure). This is rare because of dead code elimination, but 2476 * not impossible. 2477 */ 2478 if (intel->gen == 4 && !intel->is_g4x) 2479 brw_MOV(p, brw_null_reg(), dst); 2480 2481 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2482 inst->offset); 2483 2484 if (intel->gen == 4 && !intel->is_g4x) { 2485 /* gen4 errata: destination from a send can't be used as a 2486 * destination until it's been read. Just read it so we don't 2487 * have to worry. 2488 */ 2489 brw_MOV(p, brw_null_reg(), dst); 2490 } 2491} 2492 2493 2494void 2495fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2496{ 2497 assert(inst->mlen != 0); 2498 2499 /* Clear any post destination dependencies that would be ignored by 2500 * the block read. See the B-Spec for pre-gen5 send instruction. 2501 * 2502 * This could use a better solution, since texture sampling and 2503 * math reads could potentially run into it as well -- anywhere 2504 * that we have a SEND with a destination that is a register that 2505 * was written but not read within the last N instructions (what's 2506 * N? unsure). This is rare because of dead code elimination, but 2507 * not impossible. 
2508 */ 2509 if (intel->gen == 4 && !intel->is_g4x) 2510 brw_MOV(p, brw_null_reg(), dst); 2511 2512 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2513 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2514 2515 if (intel->gen == 4 && !intel->is_g4x) { 2516 /* gen4 errata: destination from a send can't be used as a 2517 * destination until it's been read. Just read it so we don't 2518 * have to worry. 2519 */ 2520 brw_MOV(p, brw_null_reg(), dst); 2521 } 2522} 2523 2524/** 2525 * To be called after the last _mesa_add_state_reference() call, to 2526 * set up prog_data.param[] for assign_curb_setup() and 2527 * setup_pull_constants(). 2528 */ 2529void 2530fs_visitor::setup_paramvalues_refs() 2531{ 2532 /* Set up the pointers to ParamValues now that that array is finalized. */ 2533 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2534 c->prog_data.param[i] = 2535 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2536 this->param_offset[i]; 2537 } 2538} 2539 2540void 2541fs_visitor::assign_curb_setup() 2542{ 2543 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2544 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2545 2546 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2547 foreach_iter(exec_list_iterator, iter, this->instructions) { 2548 fs_inst *inst = (fs_inst *)iter.get(); 2549 2550 for (unsigned int i = 0; i < 3; i++) { 2551 if (inst->src[i].file == UNIFORM) { 2552 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2553 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf + 2554 constant_nr / 8, 2555 constant_nr % 8); 2556 2557 inst->src[i].file = FIXED_HW_REG; 2558 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2559 } 2560 } 2561 } 2562} 2563 2564void 2565fs_visitor::calculate_urb_setup() 2566{ 2567 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2568 urb_setup[i] = -1; 2569 } 2570 2571 int urb_next = 0; 2572 /* Figure out where each of the incoming setup attributes lands. */ 2573 if (intel->gen >= 6) { 2574 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2575 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2576 urb_setup[i] = urb_next++; 2577 } 2578 } 2579 } else { 2580 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 2581 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2582 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2583 int fp_index; 2584 2585 if (i >= VERT_RESULT_VAR0) 2586 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2587 else if (i <= VERT_RESULT_TEX7) 2588 fp_index = i; 2589 else 2590 fp_index = -1; 2591 2592 if (fp_index >= 0) 2593 urb_setup[fp_index] = urb_next++; 2594 } 2595 } 2596 } 2597 2598 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2599 c->prog_data.urb_read_length = urb_next * 2; 2600} 2601 2602void 2603fs_visitor::assign_urb_setup() 2604{ 2605 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length; 2606 2607 /* Offset all the urb_setup[] index by the actual position of the 2608 * setup regs, now that the location of the constants has been chosen. 
2609 */ 2610 foreach_iter(exec_list_iterator, iter, this->instructions) { 2611 fs_inst *inst = (fs_inst *)iter.get(); 2612 2613 if (inst->opcode == FS_OPCODE_LINTERP) { 2614 assert(inst->src[2].file == FIXED_HW_REG); 2615 inst->src[2].fixed_hw_reg.nr += urb_start; 2616 } 2617 2618 if (inst->opcode == FS_OPCODE_CINTERP) { 2619 assert(inst->src[0].file == FIXED_HW_REG); 2620 inst->src[0].fixed_hw_reg.nr += urb_start; 2621 } 2622 } 2623 2624 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2625} 2626 2627/** 2628 * Split large virtual GRFs into separate components if we can. 2629 * 2630 * This is mostly duplicated with what brw_fs_vector_splitting does, 2631 * but that's really conservative because it's afraid of doing 2632 * splitting that doesn't result in real progress after the rest of 2633 * the optimization phases, which would cause infinite looping in 2634 * optimization. We can do it once here, safely. This also has the 2635 * opportunity to split interpolated values, or maybe even uniforms, 2636 * which we don't have at the IR level. 2637 * 2638 * We want to split, because virtual GRFs are what we register 2639 * allocate and spill (due to contiguousness requirements for some 2640 * instructions), and they're what we naturally generate in the 2641 * codegen process, but most virtual GRFs don't actually need to be 2642 * contiguous sets of GRFs. If we split, we'll end up with reduced 2643 * live intervals and better dead code elimination and coalescing. 2644 */ 2645void 2646fs_visitor::split_virtual_grfs() 2647{ 2648 int num_vars = this->virtual_grf_next; 2649 bool split_grf[num_vars]; 2650 int new_virtual_grf[num_vars]; 2651 2652 /* Try to split anything > 0 sized. */ 2653 for (int i = 0; i < num_vars; i++) { 2654 if (this->virtual_grf_sizes[i] != 1) 2655 split_grf[i] = true; 2656 else 2657 split_grf[i] = false; 2658 } 2659 2660 if (brw->has_pln) { 2661 /* PLN opcodes rely on the delta_xy being contiguous. */ 2662 split_grf[this->delta_x.reg] = false; 2663 } 2664 2665 foreach_iter(exec_list_iterator, iter, this->instructions) { 2666 fs_inst *inst = (fs_inst *)iter.get(); 2667 2668 /* Texturing produces 4 contiguous registers, so no splitting. */ 2669 if (inst->is_tex()) { 2670 split_grf[inst->dst.reg] = false; 2671 } 2672 } 2673 2674 /* Allocate new space for split regs. Note that the virtual 2675 * numbers will be contiguous. 2676 */ 2677 for (int i = 0; i < num_vars; i++) { 2678 if (split_grf[i]) { 2679 new_virtual_grf[i] = virtual_grf_alloc(1); 2680 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 2681 int reg = virtual_grf_alloc(1); 2682 assert(reg == new_virtual_grf[i] + j - 1); 2683 (void) reg; 2684 } 2685 this->virtual_grf_sizes[i] = 1; 2686 } 2687 } 2688 2689 foreach_iter(exec_list_iterator, iter, this->instructions) { 2690 fs_inst *inst = (fs_inst *)iter.get(); 2691 2692 if (inst->dst.file == GRF && 2693 split_grf[inst->dst.reg] && 2694 inst->dst.reg_offset != 0) { 2695 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 2696 inst->dst.reg_offset - 1); 2697 inst->dst.reg_offset = 0; 2698 } 2699 for (int i = 0; i < 3; i++) { 2700 if (inst->src[i].file == GRF && 2701 split_grf[inst->src[i].reg] && 2702 inst->src[i].reg_offset != 0) { 2703 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 2704 inst->src[i].reg_offset - 1); 2705 inst->src[i].reg_offset = 0; 2706 } 2707 } 2708 } 2709 this->live_intervals_valid = false; 2710} 2711 2712/** 2713 * Choose accesses from the UNIFORM file to demote to using the pull 2714 * constant buffer. 
2715 * 2716 * We allow a fragment shader to have more than the specified minimum 2717 * maximum number of fragment shader uniform components (64). If 2718 * there are too many of these, they'd fill up all of register space. 2719 * So, this will push some of them out to the pull constant buffer and 2720 * update the program to load them. 2721 */ 2722void 2723fs_visitor::setup_pull_constants() 2724{ 2725 /* Only allow 16 registers (128 uniform components) as push constants. */ 2726 unsigned int max_uniform_components = 16 * 8; 2727 if (c->prog_data.nr_params <= max_uniform_components) 2728 return; 2729 2730 /* Just demote the end of the list. We could probably do better 2731 * here, demoting things that are rarely used in the program first. 2732 */ 2733 int pull_uniform_base = max_uniform_components; 2734 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 2735 2736 foreach_iter(exec_list_iterator, iter, this->instructions) { 2737 fs_inst *inst = (fs_inst *)iter.get(); 2738 2739 for (int i = 0; i < 3; i++) { 2740 if (inst->src[i].file != UNIFORM) 2741 continue; 2742 2743 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2744 if (uniform_nr < pull_uniform_base) 2745 continue; 2746 2747 fs_reg dst = fs_reg(this, glsl_type::float_type); 2748 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 2749 dst); 2750 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 2751 pull->ir = inst->ir; 2752 pull->annotation = inst->annotation; 2753 pull->base_mrf = 14; 2754 pull->mlen = 1; 2755 2756 inst->insert_before(pull); 2757 2758 inst->src[i].file = GRF; 2759 inst->src[i].reg = dst.reg; 2760 inst->src[i].reg_offset = 0; 2761 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 2762 } 2763 } 2764 2765 for (int i = 0; i < pull_uniform_count; i++) { 2766 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 2767 c->prog_data.pull_param_convert[i] = 2768 c->prog_data.param_convert[pull_uniform_base + i]; 2769 } 2770 c->prog_data.nr_params -= pull_uniform_count; 2771 c->prog_data.nr_pull_params = pull_uniform_count; 2772} 2773 2774void 2775fs_visitor::calculate_live_intervals() 2776{ 2777 int num_vars = this->virtual_grf_next; 2778 int *def = ralloc_array(mem_ctx, int, num_vars); 2779 int *use = ralloc_array(mem_ctx, int, num_vars); 2780 int loop_depth = 0; 2781 int loop_start = 0; 2782 int bb_header_ip = 0; 2783 2784 if (this->live_intervals_valid) 2785 return; 2786 2787 for (int i = 0; i < num_vars; i++) { 2788 def[i] = MAX_INSTRUCTION; 2789 use[i] = -1; 2790 } 2791 2792 int ip = 0; 2793 foreach_iter(exec_list_iterator, iter, this->instructions) { 2794 fs_inst *inst = (fs_inst *)iter.get(); 2795 2796 if (inst->opcode == BRW_OPCODE_DO) { 2797 if (loop_depth++ == 0) 2798 loop_start = ip; 2799 } else if (inst->opcode == BRW_OPCODE_WHILE) { 2800 loop_depth--; 2801 2802 if (loop_depth == 0) { 2803 /* Patches up the use of vars marked for being live across 2804 * the whole loop. 
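* Any use that was pinned to loop_start gets extended to the WHILE's ip
* here, so the register stays live across the whole loop body.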
2805 */ 2806 for (int i = 0; i < num_vars; i++) { 2807 if (use[i] == loop_start) { 2808 use[i] = ip; 2809 } 2810 } 2811 } 2812 } else { 2813 for (unsigned int i = 0; i < 3; i++) { 2814 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 2815 int reg = inst->src[i].reg; 2816 2817 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2818 def[reg] >= bb_header_ip)) { 2819 use[reg] = ip; 2820 } else { 2821 def[reg] = MIN2(loop_start, def[reg]); 2822 use[reg] = loop_start; 2823 2824 /* Nobody else is going to go smash our start to 2825 * later in the loop now, because def[reg] now 2826 * points before the bb header. 2827 */ 2828 } 2829 } 2830 } 2831 if (inst->dst.file == GRF && inst->dst.reg != 0) { 2832 int reg = inst->dst.reg; 2833 2834 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2835 !inst->predicated)) { 2836 def[reg] = MIN2(def[reg], ip); 2837 } else { 2838 def[reg] = MIN2(def[reg], loop_start); 2839 } 2840 } 2841 } 2842 2843 ip++; 2844 2845 /* Set the basic block header IP. This is used for determining 2846 * if a complete def of single-register virtual GRF in a loop 2847 * dominates a use in the same basic block. It's a quick way to 2848 * reduce the live interval range of most register used in a 2849 * loop. 2850 */ 2851 if (inst->opcode == BRW_OPCODE_IF || 2852 inst->opcode == BRW_OPCODE_ELSE || 2853 inst->opcode == BRW_OPCODE_ENDIF || 2854 inst->opcode == BRW_OPCODE_DO || 2855 inst->opcode == BRW_OPCODE_WHILE || 2856 inst->opcode == BRW_OPCODE_BREAK || 2857 inst->opcode == BRW_OPCODE_CONTINUE) { 2858 bb_header_ip = ip; 2859 } 2860 } 2861 2862 ralloc_free(this->virtual_grf_def); 2863 ralloc_free(this->virtual_grf_use); 2864 this->virtual_grf_def = def; 2865 this->virtual_grf_use = use; 2866 2867 this->live_intervals_valid = true; 2868} 2869 2870/** 2871 * Attempts to move immediate constants into the immediate 2872 * constant slot of following instructions. 2873 * 2874 * Immediate constants are a bit tricky -- they have to be in the last 2875 * operand slot, you can't do abs/negate on them, 2876 */ 2877 2878bool 2879fs_visitor::propagate_constants() 2880{ 2881 bool progress = false; 2882 2883 calculate_live_intervals(); 2884 2885 foreach_iter(exec_list_iterator, iter, this->instructions) { 2886 fs_inst *inst = (fs_inst *)iter.get(); 2887 2888 if (inst->opcode != BRW_OPCODE_MOV || 2889 inst->predicated || 2890 inst->dst.file != GRF || inst->src[0].file != IMM || 2891 inst->dst.type != inst->src[0].type) 2892 continue; 2893 2894 /* Don't bother with cases where we should have had the 2895 * operation on the constant folded in GLSL already. 2896 */ 2897 if (inst->saturate) 2898 continue; 2899 2900 /* Found a move of a constant to a GRF. Find anything else using the GRF 2901 * before it's written, and replace it with the constant if we can. 2902 */ 2903 exec_list_iterator scan_iter = iter; 2904 scan_iter.next(); 2905 for (; scan_iter.has_next(); scan_iter.next()) { 2906 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 2907 2908 if (scan_inst->opcode == BRW_OPCODE_DO || 2909 scan_inst->opcode == BRW_OPCODE_WHILE || 2910 scan_inst->opcode == BRW_OPCODE_ELSE || 2911 scan_inst->opcode == BRW_OPCODE_ENDIF) { 2912 break; 2913 } 2914 2915 for (int i = 2; i >= 0; i--) { 2916 if (scan_inst->src[i].file != GRF || 2917 scan_inst->src[i].reg != inst->dst.reg || 2918 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 2919 continue; 2920 2921 /* Don't bother with cases where we should have had the 2922 * operation on the constant folded in GLSL already. 
2923 */ 2924 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 2925 continue; 2926 2927 switch (scan_inst->opcode) { 2928 case BRW_OPCODE_MOV: 2929 scan_inst->src[i] = inst->src[0]; 2930 progress = true; 2931 break; 2932 2933 case BRW_OPCODE_MUL: 2934 case BRW_OPCODE_ADD: 2935 if (i == 1) { 2936 scan_inst->src[i] = inst->src[0]; 2937 progress = true; 2938 } else if (i == 0 && scan_inst->src[1].file != IMM) { 2939 /* Fit this constant in by commuting the operands */ 2940 scan_inst->src[0] = scan_inst->src[1]; 2941 scan_inst->src[1] = inst->src[0]; 2942 progress = true; 2943 } 2944 break; 2945 case BRW_OPCODE_CMP: 2946 case BRW_OPCODE_SEL: 2947 if (i == 1) { 2948 scan_inst->src[i] = inst->src[0]; 2949 progress = true; 2950 } 2951 } 2952 } 2953 2954 if (scan_inst->dst.file == GRF && 2955 scan_inst->dst.reg == inst->dst.reg && 2956 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 2957 scan_inst->is_tex())) { 2958 break; 2959 } 2960 } 2961 } 2962 2963 if (progress) 2964 this->live_intervals_valid = false; 2965 2966 return progress; 2967} 2968/** 2969 * Must be called after calculate_live_intervales() to remove unused 2970 * writes to registers -- register allocation will fail otherwise 2971 * because something deffed but not used won't be considered to 2972 * interfere with other regs. 2973 */ 2974bool 2975fs_visitor::dead_code_eliminate() 2976{ 2977 bool progress = false; 2978 int pc = 0; 2979 2980 calculate_live_intervals(); 2981 2982 foreach_iter(exec_list_iterator, iter, this->instructions) { 2983 fs_inst *inst = (fs_inst *)iter.get(); 2984 2985 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 2986 inst->remove(); 2987 progress = true; 2988 } 2989 2990 pc++; 2991 } 2992 2993 if (progress) 2994 live_intervals_valid = false; 2995 2996 return progress; 2997} 2998 2999bool 3000fs_visitor::register_coalesce() 3001{ 3002 bool progress = false; 3003 int if_depth = 0; 3004 int loop_depth = 0; 3005 3006 foreach_iter(exec_list_iterator, iter, this->instructions) { 3007 fs_inst *inst = (fs_inst *)iter.get(); 3008 3009 /* Make sure that we dominate the instructions we're going to 3010 * scan for interfering with our coalescing, or we won't have 3011 * scanned enough to see if anything interferes with our 3012 * coalescing. We don't dominate the following instructions if 3013 * we're in a loop or an if block. 3014 */ 3015 switch (inst->opcode) { 3016 case BRW_OPCODE_DO: 3017 loop_depth++; 3018 break; 3019 case BRW_OPCODE_WHILE: 3020 loop_depth--; 3021 break; 3022 case BRW_OPCODE_IF: 3023 if_depth++; 3024 break; 3025 case BRW_OPCODE_ENDIF: 3026 if_depth--; 3027 break; 3028 } 3029 if (loop_depth || if_depth) 3030 continue; 3031 3032 if (inst->opcode != BRW_OPCODE_MOV || 3033 inst->predicated || 3034 inst->saturate || 3035 inst->dst.file != GRF || inst->src[0].file != GRF || 3036 inst->dst.type != inst->src[0].type) 3037 continue; 3038 3039 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 3040 3041 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3042 * them: check for no writes to either one until the exit of the 3043 * program. 
3044 */ 3045 bool interfered = false; 3046 exec_list_iterator scan_iter = iter; 3047 scan_iter.next(); 3048 for (; scan_iter.has_next(); scan_iter.next()) { 3049 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3050 3051 if (scan_inst->dst.file == GRF) { 3052 if (scan_inst->dst.reg == inst->dst.reg && 3053 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3054 scan_inst->is_tex())) { 3055 interfered = true; 3056 break; 3057 } 3058 if (scan_inst->dst.reg == inst->src[0].reg && 3059 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3060 scan_inst->is_tex())) { 3061 interfered = true; 3062 break; 3063 } 3064 } 3065 3066 /* The gen6 MATH instruction can't handle source modifiers, so avoid 3067 * coalescing those for now. We should do something more specific. 3068 */ 3069 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) { 3070 interfered = true; 3071 break; 3072 } 3073 } 3074 if (interfered) { 3075 continue; 3076 } 3077 3078 /* Rewrite the later usage to point at the source of the move to 3079 * be removed. 3080 */ 3081 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3082 scan_iter.next()) { 3083 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3084 3085 for (int i = 0; i < 3; i++) { 3086 if (scan_inst->src[i].file == GRF && 3087 scan_inst->src[i].reg == inst->dst.reg && 3088 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3089 scan_inst->src[i].reg = inst->src[0].reg; 3090 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3091 scan_inst->src[i].abs |= inst->src[0].abs; 3092 scan_inst->src[i].negate ^= inst->src[0].negate; 3093 scan_inst->src[i].smear = inst->src[0].smear; 3094 } 3095 } 3096 } 3097 3098 inst->remove(); 3099 progress = true; 3100 } 3101 3102 if (progress) 3103 live_intervals_valid = false; 3104 3105 return progress; 3106} 3107 3108 3109bool 3110fs_visitor::compute_to_mrf() 3111{ 3112 bool progress = false; 3113 int next_ip = 0; 3114 3115 calculate_live_intervals(); 3116 3117 foreach_iter(exec_list_iterator, iter, this->instructions) { 3118 fs_inst *inst = (fs_inst *)iter.get(); 3119 3120 int ip = next_ip; 3121 next_ip++; 3122 3123 if (inst->opcode != BRW_OPCODE_MOV || 3124 inst->predicated || 3125 inst->dst.file != MRF || inst->src[0].file != GRF || 3126 inst->dst.type != inst->src[0].type || 3127 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3128 continue; 3129 3130 /* Can't compute-to-MRF this GRF if someone else was going to 3131 * read it later. 3132 */ 3133 if (this->virtual_grf_use[inst->src[0].reg] > ip) 3134 continue; 3135 3136 /* Found a move of a GRF to a MRF. Let's see if we can go 3137 * rewrite the thing that made this GRF to write into the MRF. 3138 */ 3139 fs_inst *scan_inst; 3140 for (scan_inst = (fs_inst *)inst->prev; 3141 scan_inst->prev != NULL; 3142 scan_inst = (fs_inst *)scan_inst->prev) { 3143 if (scan_inst->dst.file == GRF && 3144 scan_inst->dst.reg == inst->src[0].reg) { 3145 /* Found the last thing to write our reg we want to turn 3146 * into a compute-to-MRF. 3147 */ 3148 3149 if (scan_inst->is_tex()) { 3150 /* texturing writes several continuous regs, so we can't 3151 * compute-to-mrf that. 3152 */ 3153 break; 3154 } 3155 3156 /* If it's predicated, it (probably) didn't populate all 3157 * the channels. 3158 */ 3159 if (scan_inst->predicated) 3160 break; 3161 3162 /* SEND instructions can't have MRF as a destination. 
*/ 3163 if (scan_inst->mlen) 3164 break; 3165 3166 if (intel->gen >= 6) { 3167 /* gen6 math instructions must have the destination be 3168 * GRF, so no compute-to-MRF for them. 3169 */ 3170 if (scan_inst->is_math()) { 3171 break; 3172 } 3173 } 3174 3175 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 3176 /* Found the creator of our MRF's source value. */ 3177 scan_inst->dst.file = MRF; 3178 scan_inst->dst.hw_reg = inst->dst.hw_reg; 3179 scan_inst->saturate |= inst->saturate; 3180 inst->remove(); 3181 progress = true; 3182 } 3183 break; 3184 } 3185 3186 /* We don't handle flow control here. Most computation of 3187 * values that end up in MRFs are shortly before the MRF 3188 * write anyway. 3189 */ 3190 if (scan_inst->opcode == BRW_OPCODE_DO || 3191 scan_inst->opcode == BRW_OPCODE_WHILE || 3192 scan_inst->opcode == BRW_OPCODE_ELSE || 3193 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3194 break; 3195 } 3196 3197 /* You can't read from an MRF, so if someone else reads our 3198 * MRF's source GRF that we wanted to rewrite, that stops us. 3199 */ 3200 bool interfered = false; 3201 for (int i = 0; i < 3; i++) { 3202 if (scan_inst->src[i].file == GRF && 3203 scan_inst->src[i].reg == inst->src[0].reg && 3204 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 3205 interfered = true; 3206 } 3207 } 3208 if (interfered) 3209 break; 3210 3211 if (scan_inst->dst.file == MRF && 3212 scan_inst->dst.hw_reg == inst->dst.hw_reg) { 3213 /* Somebody else wrote our MRF here, so we can't can't 3214 * compute-to-MRF before that. 3215 */ 3216 break; 3217 } 3218 3219 if (scan_inst->mlen > 0) { 3220 /* Found a SEND instruction, which means that there are 3221 * live values in MRFs from base_mrf to base_mrf + 3222 * scan_inst->mlen - 1. Don't go pushing our MRF write up 3223 * above it. 3224 */ 3225 if (inst->dst.hw_reg >= scan_inst->base_mrf && 3226 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) { 3227 break; 3228 } 3229 } 3230 } 3231 } 3232 3233 return progress; 3234} 3235 3236/** 3237 * Walks through basic blocks, locking for repeated MRF writes and 3238 * removing the later ones. 3239 */ 3240bool 3241fs_visitor::remove_duplicate_mrf_writes() 3242{ 3243 fs_inst *last_mrf_move[16]; 3244 bool progress = false; 3245 3246 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3247 3248 foreach_iter(exec_list_iterator, iter, this->instructions) { 3249 fs_inst *inst = (fs_inst *)iter.get(); 3250 3251 switch (inst->opcode) { 3252 case BRW_OPCODE_DO: 3253 case BRW_OPCODE_WHILE: 3254 case BRW_OPCODE_IF: 3255 case BRW_OPCODE_ELSE: 3256 case BRW_OPCODE_ENDIF: 3257 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3258 continue; 3259 default: 3260 break; 3261 } 3262 3263 if (inst->opcode == BRW_OPCODE_MOV && 3264 inst->dst.file == MRF) { 3265 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 3266 if (prev_inst && inst->equals(prev_inst)) { 3267 inst->remove(); 3268 progress = true; 3269 continue; 3270 } 3271 } 3272 3273 /* Clear out the last-write records for MRFs that were overwritten. */ 3274 if (inst->dst.file == MRF) { 3275 last_mrf_move[inst->dst.hw_reg] = NULL; 3276 } 3277 3278 if (inst->mlen > 0) { 3279 /* Found a SEND instruction, which will include two or fewer 3280 * implied MRF writes. We could do better here. 3281 */ 3282 for (int i = 0; i < implied_mrf_writes(inst); i++) { 3283 last_mrf_move[inst->base_mrf + i] = NULL; 3284 } 3285 } 3286 3287 /* Clear out any MRF move records whose sources got overwritten. 
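* (A recorded MOV is only still a valid duplicate if the GRF it reads
* hasn't been rewritten since it was noted in last_mrf_move.)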
*/ 3288 if (inst->dst.file == GRF) { 3289 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3290 if (last_mrf_move[i] && 3291 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3292 last_mrf_move[i] = NULL; 3293 } 3294 } 3295 } 3296 3297 if (inst->opcode == BRW_OPCODE_MOV && 3298 inst->dst.file == MRF && 3299 inst->src[0].file == GRF && 3300 !inst->predicated) { 3301 last_mrf_move[inst->dst.hw_reg] = inst; 3302 } 3303 } 3304 3305 return progress; 3306} 3307 3308bool 3309fs_visitor::virtual_grf_interferes(int a, int b) 3310{ 3311 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3312 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3313 3314 /* We can't handle dead register writes here, without iterating 3315 * over the whole instruction stream to find every single dead 3316 * write to that register to compare to the live interval of the 3317 * other register. Just assert that dead_code_eliminate() has been 3318 * called. 3319 */ 3320 assert((this->virtual_grf_use[a] != -1 || 3321 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3322 (this->virtual_grf_use[b] != -1 || 3323 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3324 3325 return start < end; 3326} 3327 3328static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3329{ 3330 struct brw_reg brw_reg; 3331 3332 switch (reg->file) { 3333 case GRF: 3334 case ARF: 3335 case MRF: 3336 if (reg->smear == -1) { 3337 brw_reg = brw_vec8_reg(reg->file, 3338 reg->hw_reg, 0); 3339 } else { 3340 brw_reg = brw_vec1_reg(reg->file, 3341 reg->hw_reg, reg->smear); 3342 } 3343 brw_reg = retype(brw_reg, reg->type); 3344 break; 3345 case IMM: 3346 switch (reg->type) { 3347 case BRW_REGISTER_TYPE_F: 3348 brw_reg = brw_imm_f(reg->imm.f); 3349 break; 3350 case BRW_REGISTER_TYPE_D: 3351 brw_reg = brw_imm_d(reg->imm.i); 3352 break; 3353 case BRW_REGISTER_TYPE_UD: 3354 brw_reg = brw_imm_ud(reg->imm.u); 3355 break; 3356 default: 3357 assert(!"not reached"); 3358 brw_reg = brw_null_reg(); 3359 break; 3360 } 3361 break; 3362 case FIXED_HW_REG: 3363 brw_reg = reg->fixed_hw_reg; 3364 break; 3365 case BAD_FILE: 3366 /* Probably unused. 
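* (Unset source slots stay BAD_FILE; hand back the null register so they
* still encode as something harmless.)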
*/ 3367 brw_reg = brw_null_reg(); 3368 break; 3369 case UNIFORM: 3370 assert(!"not reached"); 3371 brw_reg = brw_null_reg(); 3372 break; 3373 default: 3374 assert(!"not reached"); 3375 brw_reg = brw_null_reg(); 3376 break; 3377 } 3378 if (reg->abs) 3379 brw_reg = brw_abs(brw_reg); 3380 if (reg->negate) 3381 brw_reg = negate(brw_reg); 3382 3383 return brw_reg; 3384} 3385 3386void 3387fs_visitor::generate_code() 3388{ 3389 int last_native_inst = 0; 3390 const char *last_annotation_string = NULL; 3391 ir_instruction *last_annotation_ir = NULL; 3392 3393 int if_stack_array_size = 16; 3394 int loop_stack_array_size = 16; 3395 int if_stack_depth = 0, loop_stack_depth = 0; 3396 brw_instruction **if_stack = 3397 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size); 3398 brw_instruction **loop_stack = 3399 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); 3400 int *if_depth_in_loop = 3401 rzalloc_array(this->mem_ctx, int, loop_stack_array_size); 3402 3403 3404 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3405 printf("Native code for fragment shader %d:\n", 3406 ctx->Shader.CurrentFragmentProgram->Name); 3407 } 3408 3409 foreach_iter(exec_list_iterator, iter, this->instructions) { 3410 fs_inst *inst = (fs_inst *)iter.get(); 3411 struct brw_reg src[3], dst; 3412 3413 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3414 if (last_annotation_ir != inst->ir) { 3415 last_annotation_ir = inst->ir; 3416 if (last_annotation_ir) { 3417 printf(" "); 3418 last_annotation_ir->print(); 3419 printf("\n"); 3420 } 3421 } 3422 if (last_annotation_string != inst->annotation) { 3423 last_annotation_string = inst->annotation; 3424 if (last_annotation_string) 3425 printf(" %s\n", last_annotation_string); 3426 } 3427 } 3428 3429 for (unsigned int i = 0; i < 3; i++) { 3430 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3431 } 3432 dst = brw_reg_from_fs_reg(&inst->dst); 3433 3434 brw_set_conditionalmod(p, inst->conditional_mod); 3435 brw_set_predicate_control(p, inst->predicated); 3436 brw_set_saturate(p, inst->saturate); 3437 3438 switch (inst->opcode) { 3439 case BRW_OPCODE_MOV: 3440 brw_MOV(p, dst, src[0]); 3441 break; 3442 case BRW_OPCODE_ADD: 3443 brw_ADD(p, dst, src[0], src[1]); 3444 break; 3445 case BRW_OPCODE_MUL: 3446 brw_MUL(p, dst, src[0], src[1]); 3447 break; 3448 3449 case BRW_OPCODE_FRC: 3450 brw_FRC(p, dst, src[0]); 3451 break; 3452 case BRW_OPCODE_RNDD: 3453 brw_RNDD(p, dst, src[0]); 3454 break; 3455 case BRW_OPCODE_RNDE: 3456 brw_RNDE(p, dst, src[0]); 3457 break; 3458 case BRW_OPCODE_RNDZ: 3459 brw_RNDZ(p, dst, src[0]); 3460 break; 3461 3462 case BRW_OPCODE_AND: 3463 brw_AND(p, dst, src[0], src[1]); 3464 break; 3465 case BRW_OPCODE_OR: 3466 brw_OR(p, dst, src[0], src[1]); 3467 break; 3468 case BRW_OPCODE_XOR: 3469 brw_XOR(p, dst, src[0], src[1]); 3470 break; 3471 case BRW_OPCODE_NOT: 3472 brw_NOT(p, dst, src[0]); 3473 break; 3474 case BRW_OPCODE_ASR: 3475 brw_ASR(p, dst, src[0], src[1]); 3476 break; 3477 case BRW_OPCODE_SHR: 3478 brw_SHR(p, dst, src[0], src[1]); 3479 break; 3480 case BRW_OPCODE_SHL: 3481 brw_SHL(p, dst, src[0], src[1]); 3482 break; 3483 3484 case BRW_OPCODE_CMP: 3485 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3486 break; 3487 case BRW_OPCODE_SEL: 3488 brw_SEL(p, dst, src[0], src[1]); 3489 break; 3490 3491 case BRW_OPCODE_IF: 3492 if (inst->src[0].file != BAD_FILE) { 3493 assert(intel->gen >= 6); 3494 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]); 3495 } else { 3496 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8); 3497 } 
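/* Track how many IFs are open inside the current loop so BREAK and
 * CONTINUE get the right pop count, and grow if_stack if this shader
 * nests deeper than anything we've seen so far.
 */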
3498 if_depth_in_loop[loop_stack_depth]++; 3499 if_stack_depth++; 3500 if (if_stack_array_size <= if_stack_depth) { 3501 if_stack_array_size *= 2; 3502 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *, 3503 if_stack_array_size); 3504 } 3505 break; 3506 3507 case BRW_OPCODE_ELSE: 3508 if_stack[if_stack_depth - 1] = 3509 brw_ELSE(p, if_stack[if_stack_depth - 1]); 3510 break; 3511 case BRW_OPCODE_ENDIF: 3512 if_stack_depth--; 3513 brw_ENDIF(p , if_stack[if_stack_depth]); 3514 if_depth_in_loop[loop_stack_depth]--; 3515 break; 3516 3517 case BRW_OPCODE_DO: 3518 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3519 if (loop_stack_array_size <= loop_stack_depth) { 3520 loop_stack_array_size *= 2; 3521 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, 3522 loop_stack_array_size); 3523 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, 3524 loop_stack_array_size); 3525 } 3526 if_depth_in_loop[loop_stack_depth] = 0; 3527 break; 3528 3529 case BRW_OPCODE_BREAK: 3530 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3531 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3532 break; 3533 case BRW_OPCODE_CONTINUE: 3534 /* FINISHME: We need to write the loop instruction support still. */ 3535 if (intel->gen >= 6) 3536 gen6_CONT(p, loop_stack[loop_stack_depth - 1]); 3537 else 3538 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3539 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3540 break; 3541 3542 case BRW_OPCODE_WHILE: { 3543 struct brw_instruction *inst0, *inst1; 3544 GLuint br = 1; 3545 3546 if (intel->gen >= 5) 3547 br = 2; 3548 3549 assert(loop_stack_depth > 0); 3550 loop_stack_depth--; 3551 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3552 if (intel->gen < 6) { 3553 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3554 while (inst0 > loop_stack[loop_stack_depth]) { 3555 inst0--; 3556 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3557 inst0->bits3.if_else.jump_count == 0) { 3558 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3559 } 3560 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3561 inst0->bits3.if_else.jump_count == 0) { 3562 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3563 } 3564 } 3565 } 3566 } 3567 break; 3568 3569 case FS_OPCODE_RCP: 3570 case FS_OPCODE_RSQ: 3571 case FS_OPCODE_SQRT: 3572 case FS_OPCODE_EXP2: 3573 case FS_OPCODE_LOG2: 3574 case FS_OPCODE_POW: 3575 case FS_OPCODE_SIN: 3576 case FS_OPCODE_COS: 3577 generate_math(inst, dst, src); 3578 break; 3579 case FS_OPCODE_CINTERP: 3580 brw_MOV(p, dst, src[0]); 3581 break; 3582 case FS_OPCODE_LINTERP: 3583 generate_linterp(inst, dst, src); 3584 break; 3585 case FS_OPCODE_TEX: 3586 case FS_OPCODE_TXB: 3587 case FS_OPCODE_TXD: 3588 case FS_OPCODE_TXL: 3589 generate_tex(inst, dst, src[0]); 3590 break; 3591 case FS_OPCODE_DISCARD_NOT: 3592 generate_discard_not(inst, dst); 3593 break; 3594 case FS_OPCODE_DISCARD_AND: 3595 generate_discard_and(inst, src[0]); 3596 break; 3597 case FS_OPCODE_DDX: 3598 generate_ddx(inst, dst, src[0]); 3599 break; 3600 case FS_OPCODE_DDY: 3601 generate_ddy(inst, dst, src[0]); 3602 break; 3603 3604 case FS_OPCODE_SPILL: 3605 generate_spill(inst, src[0]); 3606 break; 3607 3608 case FS_OPCODE_UNSPILL: 3609 generate_unspill(inst, dst); 3610 break; 3611 3612 case FS_OPCODE_PULL_CONSTANT_LOAD: 3613 generate_pull_constant_load(inst, dst); 3614 break; 3615 3616 case FS_OPCODE_FB_WRITE: 3617 generate_fb_write(inst); 3618 break; 3619 default: 3620 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { 
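/* The opcode is in the table, so report it by name before flagging
 * the compile as failed below.
 */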
3621 _mesa_problem(ctx, "Unsupported opcode `%s' in FS", 3622 brw_opcodes[inst->opcode].name); 3623 } else { 3624 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); 3625 } 3626 fail("unsupported opcode in FS\n"); 3627 } 3628 3629 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3630 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) { 3631 if (0) { 3632 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3633 ((uint32_t *)&p->store[i])[3], 3634 ((uint32_t *)&p->store[i])[2], 3635 ((uint32_t *)&p->store[i])[1], 3636 ((uint32_t *)&p->store[i])[0]); 3637 } 3638 brw_disasm(stdout, &p->store[i], intel->gen); 3639 } 3640 } 3641 3642 last_native_inst = p->nr_insn; 3643 } 3644 3645 ralloc_free(if_stack); 3646 ralloc_free(loop_stack); 3647 ralloc_free(if_depth_in_loop); 3648 3649 brw_set_uip_jip(p); 3650 3651 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS 3652 * emit issues, it doesn't get the jump distances into the output, 3653 * which is often something we want to debug. So this is here in 3654 * case you're doing that. 3655 */ 3656 if (0) { 3657 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3658 for (unsigned int i = 0; i < p->nr_insn; i++) { 3659 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3660 ((uint32_t *)&p->store[i])[3], 3661 ((uint32_t *)&p->store[i])[2], 3662 ((uint32_t *)&p->store[i])[1], 3663 ((uint32_t *)&p->store[i])[0]); 3664 brw_disasm(stdout, &p->store[i], intel->gen); 3665 } 3666 } 3667 } 3668} 3669 3670GLboolean 3671brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) 3672{ 3673 struct intel_context *intel = &brw->intel; 3674 struct gl_context *ctx = &intel->ctx; 3675 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; 3676 3677 if (!prog) 3678 return GL_FALSE; 3679 3680 struct brw_shader *shader = 3681 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 3682 if (!shader) 3683 return GL_FALSE; 3684 3685 /* We always use 8-wide mode, at least for now. For one, flow 3686 * control only works in 8-wide. Also, when we're fragment shader 3687 * bound, we're almost always under register pressure as well, so 3688 * 8-wide would save us from the performance cliff of spilling 3689 * regs. 3690 */ 3691 c->dispatch_width = 8; 3692 3693 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3694 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 3695 _mesa_print_ir(shader->ir, NULL); 3696 printf("\n"); 3697 } 3698 3699 /* Now the main event: Visit the shader IR and generate our FS IR for it. 3700 */ 3701 fs_visitor v(c, shader); 3702 3703 if (0) { 3704 v.emit_dummy_fs(); 3705 } else { 3706 v.calculate_urb_setup(); 3707 if (intel->gen < 6) 3708 v.emit_interpolation_setup_gen4(); 3709 else 3710 v.emit_interpolation_setup_gen6(); 3711 3712 /* Generate FS IR for main(). (the visitor only descends into 3713 * functions called "main"). 
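* base_ir is updated as we go so that the fs_insts emitted for each
* top-level IR instruction can be annotated with it for INTEL_DEBUG=wm
* output in generate_code().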
3714 */ 3715 foreach_iter(exec_list_iterator, iter, *shader->ir) { 3716 ir_instruction *ir = (ir_instruction *)iter.get(); 3717 v.base_ir = ir; 3718 ir->accept(&v); 3719 } 3720 3721 v.emit_fb_writes(); 3722 3723 v.split_virtual_grfs(); 3724 3725 v.setup_paramvalues_refs(); 3726 v.setup_pull_constants(); 3727 3728 bool progress; 3729 do { 3730 progress = false; 3731 3732 progress = v.remove_duplicate_mrf_writes() || progress; 3733 3734 progress = v.propagate_constants() || progress; 3735 progress = v.register_coalesce() || progress; 3736 progress = v.compute_to_mrf() || progress; 3737 progress = v.dead_code_eliminate() || progress; 3738 } while (progress); 3739 3740 v.schedule_instructions(); 3741 3742 v.assign_curb_setup(); 3743 v.assign_urb_setup(); 3744 3745 if (0) { 3746 /* Debug of register spilling: Go spill everything. */ 3747 int virtual_grf_count = v.virtual_grf_next; 3748 for (int i = 1; i < virtual_grf_count; i++) { 3749 v.spill_reg(i); 3750 } 3751 } 3752 3753 if (0) 3754 v.assign_regs_trivial(); 3755 else { 3756 while (!v.assign_regs()) { 3757 if (v.failed) 3758 break; 3759 } 3760 } 3761 } 3762 3763 if (!v.failed) 3764 v.generate_code(); 3765 3766 assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */ 3767 3768 if (v.failed) 3769 return GL_FALSE; 3770 3771 c->prog_data.total_grf = v.grf_used; 3772 3773 return GL_TRUE; 3774} 3775