brw_fs.cpp revision 963431829055f63ec94d88c97a5d07d30e49833a
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
         ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
                         MOD_TO_FRACT |
                         DIV_TO_MUL_RCP |
                         SUB_TO_ADD_NEG |
                         EXP_TO_EXP2 |
                         LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
         lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      lower_variable_index_to_cond_assign(shader->ir,
                                          GL_TRUE, /* input */
                                          GL_TRUE, /* output */
                                          GL_TRUE, /* temp */
                                          GL_TRUE /* uniform */
                                          );

      do {
         progress = false;

         brw_do_channel_expressions(shader->ir);
         brw_do_vector_splitting(shader->ir);

         progress = do_lower_jumps(shader->ir, true, true,
                                   true, /* main return */
                                   false, /* continue */
                                   false /* loops */
                                   ) || progress;

         progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}
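/* For example, by this counting a struct { vec3 v; float f; mat2 m; } takes
 * 3 + 1 + 4 = 8 components (components() reports vector_elements *
 * matrix_columns for matrices), and a vec4[3] takes 4 * 3 = 12.
 */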
void
fs_visitor::fail(const char *format, ...)
{
   if (!failed) {
      failed = true;

      if (INTEL_DEBUG & DEBUG_WM) {
         fprintf(stderr, "FS compile failed: ");

         va_list va;
         va_start(va, format);
         vfprintf(stderr, format, va);
         va_end(va);
      }
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}
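/* For example (assuming virtual_grf_next starts at 1, with slot 0 kept as the
 * reserved unused entry above), allocations of sizes 4, 1 and 2 return GRF
 * numbers 1, 2 and 3 and leave virtual_grf_sizes as { 0, 4, 1, 2 }; the size
 * array starts at 16 entries and doubles whenever it fills up.
 */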
/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         assert(param < ARRAY_SIZE(c->prog_data.param));

         switch (type->base_type) {
         case GLSL_TYPE_FLOAT:
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         case GLSL_TYPE_UINT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
            break;
         case GLSL_TYPE_INT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
            break;
         case GLSL_TYPE_BOOL:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
            break;
         default:
            assert(!"not reached");
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         }
         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
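/* Worked example of the recursion above: a mat3 uniform at location loc is
 * handled as three vec3 float columns, appending nine params whose
 * param_index runs loc, loc+1, loc+2 (one per column) with param_offset
 * cycling 0, 1, 2 inside each column; the matrix case then returns 3, one
 * uniform storage slot per column.
 */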
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param_convert[c->prog_data.nr_params] =
            PARAM_NO_CONVERT;
         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
           fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
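/* Reading the Y setup above: in the flipped case gl_FragCoord.y ends up as
 * (drawable_height - 1) - pixel_y, plus the usual 0.5 pixel-center offset
 * when pixel_center_integer isn't set; the negate on pixel_y and the height
 * folded into the ADD immediate do the subtraction in a single instruction.
 */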
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         bool is_gl_Color =
            location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

         if (c->key.flat_shade && is_gl_Color) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Perspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               emit(FS_OPCODE_LINTERP, attr,
                    this->delta_x, this->delta_y, fs_reg(interp));
               attr.reg_offset++;
            }

            if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
               attr.reg_offset -= type->vector_elements;
               for (unsigned int k = 0; k < type->vector_elements; k++) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  attr.reg_offset++;
               }
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
                           fs_reg(r1_6ud),
                           fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
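/* In other words, the gen6 path above computes gl_FrontFacing as
 * (~(g0.0 >> 15)) & 1: the arithmetic shift brings the payload's back-facing
 * bit down to bit 0, and the NOT/AND pair inverts and masks it to a clean
 * 0-or-1 value, matching what the gen4/5 CMP against bit 31 of g1.6 produces.
 */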
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);

this->result.type = brw_type_for_base_type(ir->type); 743 744 if (index) { 745 assert(this->result.file == UNIFORM || 746 (this->result.file == GRF && 747 this->result.reg != 0)); 748 this->result.reg_offset += index->value.i[0] * element_size; 749 } else { 750 assert(!"FINISHME: non-constant array element"); 751 } 752} 753 754/* Instruction selection: Produce a MOV.sat instead of 755 * MIN(MAX(val, 0), 1) when possible. 756 */ 757bool 758fs_visitor::try_emit_saturate(ir_expression *ir) 759{ 760 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 761 762 if (!sat_val) 763 return false; 764 765 sat_val->accept(this); 766 fs_reg src = this->result; 767 768 this->result = fs_reg(this, ir->type); 769 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src); 770 inst->saturate = true; 771 772 return true; 773} 774 775static uint32_t 776brw_conditional_for_comparison(unsigned int op) 777{ 778 switch (op) { 779 case ir_binop_less: 780 return BRW_CONDITIONAL_L; 781 case ir_binop_greater: 782 return BRW_CONDITIONAL_G; 783 case ir_binop_lequal: 784 return BRW_CONDITIONAL_LE; 785 case ir_binop_gequal: 786 return BRW_CONDITIONAL_GE; 787 case ir_binop_equal: 788 case ir_binop_all_equal: /* same as equal for scalars */ 789 return BRW_CONDITIONAL_Z; 790 case ir_binop_nequal: 791 case ir_binop_any_nequal: /* same as nequal for scalars */ 792 return BRW_CONDITIONAL_NZ; 793 default: 794 assert(!"not reached: bad operation for comparison"); 795 return BRW_CONDITIONAL_NZ; 796 } 797} 798 799void 800fs_visitor::visit(ir_expression *ir) 801{ 802 unsigned int operand; 803 fs_reg op[2], temp; 804 fs_inst *inst; 805 806 assert(ir->get_num_operands() <= 2); 807 808 if (try_emit_saturate(ir)) 809 return; 810 811 for (operand = 0; operand < ir->get_num_operands(); operand++) { 812 ir->operands[operand]->accept(this); 813 if (this->result.file == BAD_FILE) { 814 ir_print_visitor v; 815 fail("Failed to get tree for expression operand:\n"); 816 ir->operands[operand]->accept(&v); 817 } 818 op[operand] = this->result; 819 820 /* Matrix expression operands should have been broken down to vector 821 * operations already. 822 */ 823 assert(!ir->operands[operand]->type->is_matrix()); 824 /* And then those vector operands should have been broken down to scalar. 825 */ 826 assert(!ir->operands[operand]->type->is_vector()); 827 } 828 829 /* Storage for our result. If our result goes into an assignment, it will 830 * just get copy-propagated out, so no worries. 831 */ 832 this->result = fs_reg(this, ir->type); 833 834 switch (ir->operation) { 835 case ir_unop_logic_not: 836 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 837 * ones complement of the whole register, not just bit 0. 
838 */ 839 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)); 840 break; 841 case ir_unop_neg: 842 op[0].negate = !op[0].negate; 843 this->result = op[0]; 844 break; 845 case ir_unop_abs: 846 op[0].abs = true; 847 op[0].negate = false; 848 this->result = op[0]; 849 break; 850 case ir_unop_sign: 851 temp = fs_reg(this, ir->type); 852 853 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)); 854 855 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 856 inst->conditional_mod = BRW_CONDITIONAL_G; 857 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)); 858 inst->predicated = true; 859 860 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 861 inst->conditional_mod = BRW_CONDITIONAL_L; 862 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)); 863 inst->predicated = true; 864 865 break; 866 case ir_unop_rcp: 867 emit_math(FS_OPCODE_RCP, this->result, op[0]); 868 break; 869 870 case ir_unop_exp2: 871 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 872 break; 873 case ir_unop_log2: 874 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 875 break; 876 case ir_unop_exp: 877 case ir_unop_log: 878 assert(!"not reached: should be handled by ir_explog_to_explog2"); 879 break; 880 case ir_unop_sin: 881 case ir_unop_sin_reduced: 882 emit_math(FS_OPCODE_SIN, this->result, op[0]); 883 break; 884 case ir_unop_cos: 885 case ir_unop_cos_reduced: 886 emit_math(FS_OPCODE_COS, this->result, op[0]); 887 break; 888 889 case ir_unop_dFdx: 890 emit(FS_OPCODE_DDX, this->result, op[0]); 891 break; 892 case ir_unop_dFdy: 893 emit(FS_OPCODE_DDY, this->result, op[0]); 894 break; 895 896 case ir_binop_add: 897 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]); 898 break; 899 case ir_binop_sub: 900 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 901 break; 902 903 case ir_binop_mul: 904 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]); 905 break; 906 case ir_binop_div: 907 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 908 break; 909 case ir_binop_mod: 910 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 911 break; 912 913 case ir_binop_less: 914 case ir_binop_greater: 915 case ir_binop_lequal: 916 case ir_binop_gequal: 917 case ir_binop_equal: 918 case ir_binop_all_equal: 919 case ir_binop_nequal: 920 case ir_binop_any_nequal: 921 temp = this->result; 922 /* original gen4 does implicit conversion before comparison. 
*/ 923 if (intel->gen < 5) 924 temp.type = op[0].type; 925 926 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); 927 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 928 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)); 929 break; 930 931 case ir_binop_logic_xor: 932 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 933 break; 934 935 case ir_binop_logic_or: 936 emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 937 break; 938 939 case ir_binop_logic_and: 940 emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 941 break; 942 943 case ir_binop_dot: 944 case ir_unop_any: 945 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 946 break; 947 948 case ir_unop_noise: 949 assert(!"not reached: should be handled by lower_noise"); 950 break; 951 952 case ir_quadop_vector: 953 assert(!"not reached: should be handled by lower_quadop_vector"); 954 break; 955 956 case ir_unop_sqrt: 957 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 958 break; 959 960 case ir_unop_rsq: 961 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 962 break; 963 964 case ir_unop_i2f: 965 case ir_unop_b2f: 966 case ir_unop_b2i: 967 case ir_unop_f2i: 968 emit(BRW_OPCODE_MOV, this->result, op[0]); 969 break; 970 case ir_unop_f2b: 971 case ir_unop_i2b: 972 temp = this->result; 973 /* original gen4 does implicit conversion before comparison. */ 974 if (intel->gen < 5) 975 temp.type = op[0].type; 976 977 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)); 978 inst->conditional_mod = BRW_CONDITIONAL_NZ; 979 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1)); 980 break; 981 982 case ir_unop_trunc: 983 emit(BRW_OPCODE_RNDZ, this->result, op[0]); 984 break; 985 case ir_unop_ceil: 986 op[0].negate = !op[0].negate; 987 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 988 this->result.negate = true; 989 break; 990 case ir_unop_floor: 991 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 992 break; 993 case ir_unop_fract: 994 inst = emit(BRW_OPCODE_FRC, this->result, op[0]); 995 break; 996 case ir_unop_round_even: 997 emit(BRW_OPCODE_RNDE, this->result, op[0]); 998 break; 999 1000 case ir_binop_min: 1001 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1002 inst->conditional_mod = BRW_CONDITIONAL_L; 1003 1004 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1005 inst->predicated = true; 1006 break; 1007 case ir_binop_max: 1008 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1009 inst->conditional_mod = BRW_CONDITIONAL_G; 1010 1011 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1012 inst->predicated = true; 1013 break; 1014 1015 case ir_binop_pow: 1016 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1017 break; 1018 1019 case ir_unop_bit_not: 1020 inst = emit(BRW_OPCODE_NOT, this->result, op[0]); 1021 break; 1022 case ir_binop_bit_and: 1023 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 1024 break; 1025 case ir_binop_bit_xor: 1026 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 1027 break; 1028 case ir_binop_bit_or: 1029 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 1030 break; 1031 1032 case ir_unop_u2f: 1033 case ir_binop_lshift: 1034 case ir_binop_rshift: 1035 assert(!"GLSL 1.30 features unsupported"); 1036 break; 1037 } 1038} 1039 1040void 1041fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1042 const glsl_type *type, bool predicated) 1043{ 1044 switch (type->base_type) { 1045 case GLSL_TYPE_FLOAT: 1046 case GLSL_TYPE_UINT: 1047 case GLSL_TYPE_INT: 1048 case GLSL_TYPE_BOOL: 1049 for 
(unsigned int i = 0; i < type->components(); i++) { 1050 l.type = brw_type_for_base_type(type); 1051 r.type = brw_type_for_base_type(type); 1052 1053 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r); 1054 inst->predicated = predicated; 1055 1056 l.reg_offset++; 1057 r.reg_offset++; 1058 } 1059 break; 1060 case GLSL_TYPE_ARRAY: 1061 for (unsigned int i = 0; i < type->length; i++) { 1062 emit_assignment_writes(l, r, type->fields.array, predicated); 1063 } 1064 break; 1065 1066 case GLSL_TYPE_STRUCT: 1067 for (unsigned int i = 0; i < type->length; i++) { 1068 emit_assignment_writes(l, r, type->fields.structure[i].type, 1069 predicated); 1070 } 1071 break; 1072 1073 case GLSL_TYPE_SAMPLER: 1074 break; 1075 1076 default: 1077 assert(!"not reached"); 1078 break; 1079 } 1080} 1081 1082void 1083fs_visitor::visit(ir_assignment *ir) 1084{ 1085 struct fs_reg l, r; 1086 fs_inst *inst; 1087 1088 /* FINISHME: arrays on the lhs */ 1089 ir->lhs->accept(this); 1090 l = this->result; 1091 1092 ir->rhs->accept(this); 1093 r = this->result; 1094 1095 assert(l.file != BAD_FILE); 1096 assert(r.file != BAD_FILE); 1097 1098 if (ir->condition) { 1099 emit_bool_to_cond_code(ir->condition); 1100 } 1101 1102 if (ir->lhs->type->is_scalar() || 1103 ir->lhs->type->is_vector()) { 1104 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1105 if (ir->write_mask & (1 << i)) { 1106 inst = emit(BRW_OPCODE_MOV, l, r); 1107 if (ir->condition) 1108 inst->predicated = true; 1109 r.reg_offset++; 1110 } 1111 l.reg_offset++; 1112 } 1113 } else { 1114 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1115 } 1116} 1117 1118fs_inst * 1119fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1120{ 1121 int mlen; 1122 int base_mrf = 1; 1123 bool simd16 = false; 1124 fs_reg orig_dst; 1125 1126 /* g0 header. */ 1127 mlen = 1; 1128 1129 if (ir->shadow_comparitor) { 1130 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1131 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1132 coordinate.reg_offset++; 1133 } 1134 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1135 mlen += 3; 1136 1137 if (ir->op == ir_tex) { 1138 /* There's no plain shadow compare message, so we use shadow 1139 * compare with a bias of 0.0. 1140 */ 1141 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); 1142 mlen++; 1143 } else if (ir->op == ir_txb) { 1144 ir->lod_info.bias->accept(this); 1145 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1146 mlen++; 1147 } else { 1148 assert(ir->op == ir_txl); 1149 ir->lod_info.lod->accept(this); 1150 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1151 mlen++; 1152 } 1153 1154 ir->shadow_comparitor->accept(this); 1155 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1156 mlen++; 1157 } else if (ir->op == ir_tex) { 1158 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1159 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1160 coordinate.reg_offset++; 1161 } 1162 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1163 mlen += 3; 1164 } else if (ir->op == ir_txd) { 1165 assert(!"TXD isn't supported on gen4 yet."); 1166 } else { 1167 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1168 * instructions. We'll need to do SIMD16 here. 
1169 */ 1170 assert(ir->op == ir_txb || ir->op == ir_txl); 1171 1172 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1173 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate); 1174 coordinate.reg_offset++; 1175 } 1176 1177 /* lod/bias appears after u/v/r. */ 1178 mlen += 6; 1179 1180 if (ir->op == ir_txb) { 1181 ir->lod_info.bias->accept(this); 1182 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1183 mlen++; 1184 } else { 1185 ir->lod_info.lod->accept(this); 1186 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1187 mlen++; 1188 } 1189 1190 /* The unused upper half. */ 1191 mlen++; 1192 1193 /* Now, since we're doing simd16, the return is 2 interleaved 1194 * vec4s where the odd-indexed ones are junk. We'll need to move 1195 * this weirdness around to the expected layout. 1196 */ 1197 simd16 = true; 1198 orig_dst = dst; 1199 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1200 2)); 1201 dst.type = BRW_REGISTER_TYPE_F; 1202 } 1203 1204 fs_inst *inst = NULL; 1205 switch (ir->op) { 1206 case ir_tex: 1207 inst = emit(FS_OPCODE_TEX, dst); 1208 break; 1209 case ir_txb: 1210 inst = emit(FS_OPCODE_TXB, dst); 1211 break; 1212 case ir_txl: 1213 inst = emit(FS_OPCODE_TXL, dst); 1214 break; 1215 case ir_txd: 1216 inst = emit(FS_OPCODE_TXD, dst); 1217 break; 1218 case ir_txf: 1219 assert(!"GLSL 1.30 features unsupported"); 1220 break; 1221 } 1222 inst->base_mrf = base_mrf; 1223 inst->mlen = mlen; 1224 1225 if (simd16) { 1226 for (int i = 0; i < 4; i++) { 1227 emit(BRW_OPCODE_MOV, orig_dst, dst); 1228 orig_dst.reg_offset++; 1229 dst.reg_offset += 2; 1230 } 1231 } 1232 1233 return inst; 1234} 1235 1236fs_inst * 1237fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1238{ 1239 /* gen5's SIMD8 sampler has slots for u, v, r, array index, then 1240 * optional parameters like shadow comparitor or LOD bias. If 1241 * optional parameters aren't present, those base slots are 1242 * optional and don't need to be included in the message. 1243 * 1244 * We don't fill in the unnecessary slots regardless, which may 1245 * look surprising in the disassembly. 1246 */ 1247 int mlen = 1; /* g0 header always present. 
*/ 1248 int base_mrf = 1; 1249 1250 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1251 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1252 coordinate.reg_offset++; 1253 } 1254 mlen += ir->coordinate->type->vector_elements; 1255 1256 if (ir->shadow_comparitor) { 1257 mlen = MAX2(mlen, 5); 1258 1259 ir->shadow_comparitor->accept(this); 1260 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1261 mlen++; 1262 } 1263 1264 fs_inst *inst = NULL; 1265 switch (ir->op) { 1266 case ir_tex: 1267 inst = emit(FS_OPCODE_TEX, dst); 1268 break; 1269 case ir_txb: 1270 ir->lod_info.bias->accept(this); 1271 mlen = MAX2(mlen, 5); 1272 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1273 mlen++; 1274 1275 inst = emit(FS_OPCODE_TXB, dst); 1276 break; 1277 case ir_txl: 1278 ir->lod_info.lod->accept(this); 1279 mlen = MAX2(mlen, 5); 1280 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1281 mlen++; 1282 1283 inst = emit(FS_OPCODE_TXL, dst); 1284 break; 1285 case ir_txd: 1286 case ir_txf: 1287 assert(!"GLSL 1.30 features unsupported"); 1288 break; 1289 } 1290 inst->base_mrf = base_mrf; 1291 inst->mlen = mlen; 1292 1293 return inst; 1294} 1295 1296void 1297fs_visitor::visit(ir_texture *ir) 1298{ 1299 int sampler; 1300 fs_inst *inst = NULL; 1301 1302 ir->coordinate->accept(this); 1303 fs_reg coordinate = this->result; 1304 1305 if (ir->offset != NULL) { 1306 ir_constant *offset = ir->offset->as_constant(); 1307 assert(offset != NULL); 1308 1309 signed char offsets[3]; 1310 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) 1311 offsets[i] = (signed char) offset->value.i[i]; 1312 1313 /* Combine all three offsets into a single unsigned dword: 1314 * 1315 * bits 11:8 - U Offset (X component) 1316 * bits 7:4 - V Offset (Y component) 1317 * bits 3:0 - R Offset (Z component) 1318 */ 1319 unsigned offset_bits = 0; 1320 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { 1321 const unsigned shift = 4 * (2 - i); 1322 offset_bits |= (offsets[i] << shift) & (0xF << shift); 1323 } 1324 1325 /* Explicitly set up the message header by copying g0 to msg reg m1. */ 1326 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), 1327 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD)); 1328 1329 /* Then set the offset bits in DWord 2 of the message header. */ 1330 emit(BRW_OPCODE_MOV, 1331 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), 1332 BRW_REGISTER_TYPE_UD)), 1333 fs_reg(brw_imm_uw(offset_bits))); 1334 } 1335 1336 /* Should be lowered by do_lower_texture_projection */ 1337 assert(!ir->projector); 1338 1339 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1340 ctx->Shader.CurrentFragmentProgram, 1341 &brw->fragment_program->Base); 1342 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1343 1344 /* The 965 requires the EU to do the normalization of GL rectangle 1345 * texture coordinates. We use the program parameter state 1346 * tracking to get the scaling factor. 
1347 */ 1348 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1349 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1350 int tokens[STATE_LENGTH] = { 1351 STATE_INTERNAL, 1352 STATE_TEXRECT_SCALE, 1353 sampler, 1354 0, 1355 0 1356 }; 1357 1358 c->prog_data.param_convert[c->prog_data.nr_params] = 1359 PARAM_NO_CONVERT; 1360 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1361 PARAM_NO_CONVERT; 1362 1363 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1364 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1365 GLuint index = _mesa_add_state_reference(params, 1366 (gl_state_index *)tokens); 1367 1368 this->param_index[c->prog_data.nr_params] = index; 1369 this->param_offset[c->prog_data.nr_params] = 0; 1370 c->prog_data.nr_params++; 1371 this->param_index[c->prog_data.nr_params] = index; 1372 this->param_offset[c->prog_data.nr_params] = 1; 1373 c->prog_data.nr_params++; 1374 1375 fs_reg dst = fs_reg(this, ir->coordinate->type); 1376 fs_reg src = coordinate; 1377 coordinate = dst; 1378 1379 emit(BRW_OPCODE_MUL, dst, src, scale_x); 1380 dst.reg_offset++; 1381 src.reg_offset++; 1382 emit(BRW_OPCODE_MUL, dst, src, scale_y); 1383 } 1384 1385 /* Writemasking doesn't eliminate channels on SIMD8 texture 1386 * samples, so don't worry about them. 1387 */ 1388 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1389 1390 if (intel->gen < 5) { 1391 inst = emit_texture_gen4(ir, dst, coordinate); 1392 } else { 1393 inst = emit_texture_gen5(ir, dst, coordinate); 1394 } 1395 1396 /* If there's an offset, we already set up m1. To avoid the implied move, 1397 * use the null register. Otherwise, we want an implied move from g0. 1398 */ 1399 if (ir->offset != NULL) 1400 inst->src[0] = fs_reg(brw_null_reg()); 1401 else 1402 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); 1403 1404 inst->sampler = sampler; 1405 1406 this->result = dst; 1407 1408 if (ir->shadow_comparitor) 1409 inst->shadow_compare = true; 1410 1411 if (ir->type == glsl_type::float_type) { 1412 /* Ignore DEPTH_TEXTURE_MODE swizzling. 
*/ 1413 assert(ir->sampler->type->sampler_shadow); 1414 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1415 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1416 1417 for (int i = 0; i < 4; i++) { 1418 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1419 fs_reg l = swizzle_dst; 1420 l.reg_offset += i; 1421 1422 if (swiz == SWIZZLE_ZERO) { 1423 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f)); 1424 } else if (swiz == SWIZZLE_ONE) { 1425 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f)); 1426 } else { 1427 fs_reg r = dst; 1428 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1429 emit(BRW_OPCODE_MOV, l, r); 1430 } 1431 } 1432 this->result = swizzle_dst; 1433 } 1434} 1435 1436void 1437fs_visitor::visit(ir_swizzle *ir) 1438{ 1439 ir->val->accept(this); 1440 fs_reg val = this->result; 1441 1442 if (ir->type->vector_elements == 1) { 1443 this->result.reg_offset += ir->mask.x; 1444 return; 1445 } 1446 1447 fs_reg result = fs_reg(this, ir->type); 1448 this->result = result; 1449 1450 for (unsigned int i = 0; i < ir->type->vector_elements; i++) { 1451 fs_reg channel = val; 1452 int swiz = 0; 1453 1454 switch (i) { 1455 case 0: 1456 swiz = ir->mask.x; 1457 break; 1458 case 1: 1459 swiz = ir->mask.y; 1460 break; 1461 case 2: 1462 swiz = ir->mask.z; 1463 break; 1464 case 3: 1465 swiz = ir->mask.w; 1466 break; 1467 } 1468 1469 channel.reg_offset += swiz; 1470 emit(BRW_OPCODE_MOV, result, channel); 1471 result.reg_offset++; 1472 } 1473} 1474 1475void 1476fs_visitor::visit(ir_discard *ir) 1477{ 1478 fs_reg temp = fs_reg(this, glsl_type::uint_type); 1479 1480 assert(ir->condition == NULL); /* FINISHME */ 1481 1482 emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d); 1483 emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp); 1484 kill_emitted = true; 1485} 1486 1487void 1488fs_visitor::visit(ir_constant *ir) 1489{ 1490 /* Set this->result to reg at the bottom of the function because some code 1491 * paths will cause this visitor to be applied to other fields. This will 1492 * cause the value stored in this->result to be modified. 1493 * 1494 * Make reg constant so that it doesn't get accidentally modified along the 1495 * way. Yes, I actually had this problem. 
:( 1496 */ 1497 const fs_reg reg(this, ir->type); 1498 fs_reg dst_reg = reg; 1499 1500 if (ir->type->is_array()) { 1501 const unsigned size = type_size(ir->type->fields.array); 1502 1503 for (unsigned i = 0; i < ir->type->length; i++) { 1504 ir->array_elements[i]->accept(this); 1505 fs_reg src_reg = this->result; 1506 1507 dst_reg.type = src_reg.type; 1508 for (unsigned j = 0; j < size; j++) { 1509 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1510 src_reg.reg_offset++; 1511 dst_reg.reg_offset++; 1512 } 1513 } 1514 } else if (ir->type->is_record()) { 1515 foreach_list(node, &ir->components) { 1516 ir_instruction *const field = (ir_instruction *) node; 1517 const unsigned size = type_size(field->type); 1518 1519 field->accept(this); 1520 fs_reg src_reg = this->result; 1521 1522 dst_reg.type = src_reg.type; 1523 for (unsigned j = 0; j < size; j++) { 1524 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1525 src_reg.reg_offset++; 1526 dst_reg.reg_offset++; 1527 } 1528 } 1529 } else { 1530 const unsigned size = type_size(ir->type); 1531 1532 for (unsigned i = 0; i < size; i++) { 1533 switch (ir->type->base_type) { 1534 case GLSL_TYPE_FLOAT: 1535 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])); 1536 break; 1537 case GLSL_TYPE_UINT: 1538 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])); 1539 break; 1540 case GLSL_TYPE_INT: 1541 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])); 1542 break; 1543 case GLSL_TYPE_BOOL: 1544 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])); 1545 break; 1546 default: 1547 assert(!"Non-float/uint/int/bool constant"); 1548 } 1549 dst_reg.reg_offset++; 1550 } 1551 } 1552 1553 this->result = reg; 1554} 1555 1556void 1557fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1558{ 1559 ir_expression *expr = ir->as_expression(); 1560 1561 if (expr) { 1562 fs_reg op[2]; 1563 fs_inst *inst; 1564 1565 assert(expr->get_num_operands() <= 2); 1566 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1567 assert(expr->operands[i]->type->is_scalar()); 1568 1569 expr->operands[i]->accept(this); 1570 op[i] = this->result; 1571 } 1572 1573 switch (expr->operation) { 1574 case ir_unop_logic_not: 1575 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)); 1576 inst->conditional_mod = BRW_CONDITIONAL_Z; 1577 break; 1578 1579 case ir_binop_logic_xor: 1580 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]); 1581 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1582 break; 1583 1584 case ir_binop_logic_or: 1585 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]); 1586 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1587 break; 1588 1589 case ir_binop_logic_and: 1590 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]); 1591 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1592 break; 1593 1594 case ir_unop_f2b: 1595 if (intel->gen >= 6) { 1596 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f)); 1597 } else { 1598 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]); 1599 } 1600 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1601 break; 1602 1603 case ir_unop_i2b: 1604 if (intel->gen >= 6) { 1605 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)); 1606 } else { 1607 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]); 1608 } 1609 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1610 break; 1611 1612 case ir_binop_greater: 1613 case ir_binop_gequal: 1614 case ir_binop_less: 1615 case ir_binop_lequal: 1616 case ir_binop_equal: 1617 case ir_binop_all_equal: 1618 case ir_binop_nequal: 1619 case ir_binop_any_nequal: 1620 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]); 1621 
inst->conditional_mod = 1622 brw_conditional_for_comparison(expr->operation); 1623 break; 1624 1625 default: 1626 assert(!"not reached"); 1627 fail("bad cond code\n"); 1628 break; 1629 } 1630 return; 1631 } 1632 1633 ir->accept(this); 1634 1635 if (intel->gen >= 6) { 1636 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1)); 1637 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1638 } else { 1639 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result); 1640 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1641 } 1642} 1643 1644/** 1645 * Emit a gen6 IF statement with the comparison folded into the IF 1646 * instruction. 1647 */ 1648void 1649fs_visitor::emit_if_gen6(ir_if *ir) 1650{ 1651 ir_expression *expr = ir->condition->as_expression(); 1652 1653 if (expr) { 1654 fs_reg op[2]; 1655 fs_inst *inst; 1656 fs_reg temp; 1657 1658 assert(expr->get_num_operands() <= 2); 1659 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1660 assert(expr->operands[i]->type->is_scalar()); 1661 1662 expr->operands[i]->accept(this); 1663 op[i] = this->result; 1664 } 1665 1666 switch (expr->operation) { 1667 case ir_unop_logic_not: 1668 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0)); 1669 inst->conditional_mod = BRW_CONDITIONAL_Z; 1670 return; 1671 1672 case ir_binop_logic_xor: 1673 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1674 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1675 return; 1676 1677 case ir_binop_logic_or: 1678 temp = fs_reg(this, glsl_type::bool_type); 1679 emit(BRW_OPCODE_OR, temp, op[0], op[1]); 1680 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1681 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1682 return; 1683 1684 case ir_binop_logic_and: 1685 temp = fs_reg(this, glsl_type::bool_type); 1686 emit(BRW_OPCODE_AND, temp, op[0], op[1]); 1687 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1688 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1689 return; 1690 1691 case ir_unop_f2b: 1692 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)); 1693 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1694 return; 1695 1696 case ir_unop_i2b: 1697 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1698 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1699 return; 1700 1701 case ir_binop_greater: 1702 case ir_binop_gequal: 1703 case ir_binop_less: 1704 case ir_binop_lequal: 1705 case ir_binop_equal: 1706 case ir_binop_all_equal: 1707 case ir_binop_nequal: 1708 case ir_binop_any_nequal: 1709 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1710 inst->conditional_mod = 1711 brw_conditional_for_comparison(expr->operation); 1712 return; 1713 default: 1714 assert(!"not reached"); 1715 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1716 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1717 fail("bad condition\n"); 1718 return; 1719 } 1720 return; 1721 } 1722 1723 ir->condition->accept(this); 1724 1725 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)); 1726 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1727} 1728 1729void 1730fs_visitor::visit(ir_if *ir) 1731{ 1732 fs_inst *inst; 1733 1734 /* Don't point the annotation at the if statement, because then it plus 1735 * the then and else blocks get printed. 
1736 */ 1737 this->base_ir = ir->condition; 1738 1739 if (intel->gen >= 6) { 1740 emit_if_gen6(ir); 1741 } else { 1742 emit_bool_to_cond_code(ir->condition); 1743 1744 inst = emit(BRW_OPCODE_IF); 1745 inst->predicated = true; 1746 } 1747 1748 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1749 ir_instruction *ir = (ir_instruction *)iter.get(); 1750 this->base_ir = ir; 1751 1752 ir->accept(this); 1753 } 1754 1755 if (!ir->else_instructions.is_empty()) { 1756 emit(BRW_OPCODE_ELSE); 1757 1758 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1759 ir_instruction *ir = (ir_instruction *)iter.get(); 1760 this->base_ir = ir; 1761 1762 ir->accept(this); 1763 } 1764 } 1765 1766 emit(BRW_OPCODE_ENDIF); 1767} 1768 1769void 1770fs_visitor::visit(ir_loop *ir) 1771{ 1772 fs_reg counter = reg_undef; 1773 1774 if (ir->counter) { 1775 this->base_ir = ir->counter; 1776 ir->counter->accept(this); 1777 counter = *(variable_storage(ir->counter)); 1778 1779 if (ir->from) { 1780 this->base_ir = ir->from; 1781 ir->from->accept(this); 1782 1783 emit(BRW_OPCODE_MOV, counter, this->result); 1784 } 1785 } 1786 1787 emit(BRW_OPCODE_DO); 1788 1789 if (ir->to) { 1790 this->base_ir = ir->to; 1791 ir->to->accept(this); 1792 1793 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result); 1794 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); 1795 1796 inst = emit(BRW_OPCODE_BREAK); 1797 inst->predicated = true; 1798 } 1799 1800 foreach_iter(exec_list_iterator, iter, ir->body_instructions) { 1801 ir_instruction *ir = (ir_instruction *)iter.get(); 1802 1803 this->base_ir = ir; 1804 ir->accept(this); 1805 } 1806 1807 if (ir->increment) { 1808 this->base_ir = ir->increment; 1809 ir->increment->accept(this); 1810 emit(BRW_OPCODE_ADD, counter, counter, this->result); 1811 } 1812 1813 emit(BRW_OPCODE_WHILE); 1814} 1815 1816void 1817fs_visitor::visit(ir_loop_jump *ir) 1818{ 1819 switch (ir->mode) { 1820 case ir_loop_jump::jump_break: 1821 emit(BRW_OPCODE_BREAK); 1822 break; 1823 case ir_loop_jump::jump_continue: 1824 emit(BRW_OPCODE_CONTINUE); 1825 break; 1826 } 1827} 1828 1829void 1830fs_visitor::visit(ir_call *ir) 1831{ 1832 assert(!"FINISHME"); 1833} 1834 1835void 1836fs_visitor::visit(ir_return *ir) 1837{ 1838 assert(!"FINISHME"); 1839} 1840 1841void 1842fs_visitor::visit(ir_function *ir) 1843{ 1844 /* Ignore function bodies other than main() -- we shouldn't see calls to 1845 * them since they should all be inlined before we get to ir_to_mesa. 1846 */ 1847 if (strcmp(ir->name, "main") == 0) { 1848 const ir_function_signature *sig; 1849 exec_list empty; 1850 1851 sig = ir->matching_signature(&empty); 1852 1853 assert(sig); 1854 1855 foreach_iter(exec_list_iterator, iter, sig->body) { 1856 ir_instruction *ir = (ir_instruction *)iter.get(); 1857 this->base_ir = ir; 1858 1859 ir->accept(this); 1860 } 1861 } 1862} 1863 1864void 1865fs_visitor::visit(ir_function_signature *ir) 1866{ 1867 assert(!"not reached"); 1868 (void)ir; 1869} 1870 1871fs_inst * 1872fs_visitor::emit(fs_inst inst) 1873{ 1874 fs_inst *list_inst = new(mem_ctx) fs_inst; 1875 *list_inst = inst; 1876 1877 list_inst->annotation = this->current_annotation; 1878 list_inst->ir = this->base_ir; 1879 1880 this->instructions.push_tail(list_inst); 1881 1882 return list_inst; 1883} 1884 1885/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 1886void 1887fs_visitor::emit_dummy_fs() 1888{ 1889 /* Everyone's favorite color. 
*/ 1890 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f)); 1891 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f)); 1892 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f)); 1893 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f)); 1894 1895 fs_inst *write; 1896 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); 1897 write->base_mrf = 0; 1898} 1899 1900/* The register location here is relative to the start of the URB 1901 * data. It will get adjusted to be a real location before 1902 * generate_code() time. 1903 */ 1904struct brw_reg 1905fs_visitor::interp_reg(int location, int channel) 1906{ 1907 int regnr = urb_setup[location] * 2 + channel / 2; 1908 int stride = (channel & 1) * 4; 1909 1910 assert(urb_setup[location] != -1); 1911 1912 return brw_vec1_grf(regnr, stride); 1913} 1914 1915/** Emits the interpolation for the varying inputs. */ 1916void 1917fs_visitor::emit_interpolation_setup_gen4() 1918{ 1919 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 1920 1921 this->current_annotation = "compute pixel centers"; 1922 this->pixel_x = fs_reg(this, glsl_type::uint_type); 1923 this->pixel_y = fs_reg(this, glsl_type::uint_type); 1924 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 1925 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 1926 emit(BRW_OPCODE_ADD, 1927 this->pixel_x, 1928 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 1929 fs_reg(brw_imm_v(0x10101010))); 1930 emit(BRW_OPCODE_ADD, 1931 this->pixel_y, 1932 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 1933 fs_reg(brw_imm_v(0x11001100))); 1934 1935 this->current_annotation = "compute pixel deltas from v0"; 1936 if (brw->has_pln) { 1937 this->delta_x = fs_reg(this, glsl_type::vec2_type); 1938 this->delta_y = this->delta_x; 1939 this->delta_y.reg_offset++; 1940 } else { 1941 this->delta_x = fs_reg(this, glsl_type::float_type); 1942 this->delta_y = fs_reg(this, glsl_type::float_type); 1943 } 1944 emit(BRW_OPCODE_ADD, this->delta_x, 1945 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))); 1946 emit(BRW_OPCODE_ADD, this->delta_y, 1947 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))); 1948 1949 this->current_annotation = "compute pos.w and 1/pos.w"; 1950 /* Compute wpos.w. It's always in our setup, since it's needed to 1951 * interpolate the other attributes. 1952 */ 1953 this->wpos_w = fs_reg(this, glsl_type::float_type); 1954 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y, 1955 interp_reg(FRAG_ATTRIB_WPOS, 3)); 1956 /* Compute the pixel 1/W value from wpos.w. */ 1957 this->pixel_w = fs_reg(this, glsl_type::float_type); 1958 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 1959 this->current_annotation = NULL; 1960} 1961 1962/** Emits the interpolation for the varying inputs. */ 1963void 1964fs_visitor::emit_interpolation_setup_gen6() 1965{ 1966 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 1967 1968 /* If the pixel centers end up used, the setup is the same as for gen4. */ 1969 this->current_annotation = "compute pixel centers"; 1970 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); 1971 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); 1972 int_pixel_x.type = BRW_REGISTER_TYPE_UW; 1973 int_pixel_y.type = BRW_REGISTER_TYPE_UW; 1974 emit(BRW_OPCODE_ADD, 1975 int_pixel_x, 1976 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 1977 fs_reg(brw_imm_v(0x10101010))); 1978 emit(BRW_OPCODE_ADD, 1979 int_pixel_y, 1980 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 1981 fs_reg(brw_imm_v(0x11001100))); 1982 1983 /* As of gen6, we can no longer mix float and int sources. 
We have 1984 * to turn the integer pixel centers into floats for their actual 1985 * use. 1986 */ 1987 this->pixel_x = fs_reg(this, glsl_type::float_type); 1988 this->pixel_y = fs_reg(this, glsl_type::float_type); 1989 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x); 1990 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y); 1991 1992 this->current_annotation = "compute pos.w"; 1993 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); 1994 this->wpos_w = fs_reg(this, glsl_type::float_type); 1995 emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w); 1996 1997 this->delta_x = fs_reg(brw_vec8_grf(2, 0)); 1998 this->delta_y = fs_reg(brw_vec8_grf(3, 0)); 1999 2000 this->current_annotation = NULL; 2001} 2002 2003void 2004fs_visitor::emit_fb_writes() 2005{ 2006 this->current_annotation = "FB write header"; 2007 GLboolean header_present = GL_TRUE; 2008 int nr = 0; 2009 2010 if (intel->gen >= 6 && 2011 !this->kill_emitted && 2012 c->key.nr_color_regions == 1) { 2013 header_present = false; 2014 } 2015 2016 if (header_present) { 2017 /* m0, m1 header */ 2018 nr += 2; 2019 } 2020 2021 if (c->aa_dest_stencil_reg) { 2022 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2023 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))); 2024 } 2025 2026 /* Reserve space for color. It'll be filled in per MRT below. */ 2027 int color_mrf = nr; 2028 nr += 4; 2029 2030 if (c->source_depth_to_render_target) { 2031 if (c->computes_depth) { 2032 /* Hand over gl_FragDepth. */ 2033 assert(this->frag_depth); 2034 fs_reg depth = *(variable_storage(this->frag_depth)); 2035 2036 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth); 2037 } else { 2038 /* Pass through the payload depth. */ 2039 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2040 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 2041 } 2042 } 2043 2044 if (c->dest_depth_reg) { 2045 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2046 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))); 2047 } 2048 2049 fs_reg color = reg_undef; 2050 if (this->frag_color) 2051 color = *(variable_storage(this->frag_color)); 2052 else if (this->frag_data) { 2053 color = *(variable_storage(this->frag_data)); 2054 color.type = BRW_REGISTER_TYPE_F; 2055 } 2056 2057 for (int target = 0; target < c->key.nr_color_regions; target++) { 2058 this->current_annotation = ralloc_asprintf(this->mem_ctx, 2059 "FB write target %d", 2060 target); 2061 if (this->frag_color || this->frag_data) { 2062 for (int i = 0; i < 4; i++) { 2063 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color); 2064 color.reg_offset++; 2065 } 2066 } 2067 2068 if (this->frag_color) 2069 color.reg_offset -= 4; 2070 2071 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2072 inst->target = target; 2073 inst->base_mrf = 0; 2074 inst->mlen = nr; 2075 if (target == c->key.nr_color_regions - 1) 2076 inst->eot = true; 2077 inst->header_present = header_present; 2078 } 2079 2080 if (c->key.nr_color_regions == 0) { 2081 if (c->key.alpha_test && (this->frag_color || this->frag_data)) { 2082 /* If the alpha test is enabled but there's no color buffer, 2083 * we still need to send alpha out the pipeline to our null 2084 * renderbuffer. 
2085 */ 2086 color.reg_offset += 3; 2087 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color); 2088 } 2089 2090 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2091 inst->base_mrf = 0; 2092 inst->mlen = nr; 2093 inst->eot = true; 2094 inst->header_present = header_present; 2095 } 2096 2097 this->current_annotation = NULL; 2098} 2099 2100void 2101fs_visitor::generate_fb_write(fs_inst *inst) 2102{ 2103 GLboolean eot = inst->eot; 2104 struct brw_reg implied_header; 2105 2106 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied 2107 * move, here's g1. 2108 */ 2109 brw_push_insn_state(p); 2110 brw_set_mask_control(p, BRW_MASK_DISABLE); 2111 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2112 2113 if (inst->header_present) { 2114 if (intel->gen >= 6) { 2115 brw_MOV(p, 2116 brw_message_reg(inst->base_mrf), 2117 brw_vec8_grf(0, 0)); 2118 2119 if (inst->target > 0) { 2120 /* Set the render target index for choosing BLEND_STATE. */ 2121 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2122 BRW_REGISTER_TYPE_UD), 2123 brw_imm_ud(inst->target)); 2124 } 2125 2126 /* Clear viewport index, render target array index. */ 2127 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0), 2128 BRW_REGISTER_TYPE_UD), 2129 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2130 brw_imm_ud(0xf7ff)); 2131 2132 implied_header = brw_null_reg(); 2133 } else { 2134 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2135 } 2136 2137 brw_MOV(p, 2138 brw_message_reg(inst->base_mrf + 1), 2139 brw_vec8_grf(1, 0)); 2140 } else { 2141 implied_header = brw_null_reg(); 2142 } 2143 2144 brw_pop_insn_state(p); 2145 2146 brw_fb_WRITE(p, 2147 8, /* dispatch_width */ 2148 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), 2149 inst->base_mrf, 2150 implied_header, 2151 inst->target, 2152 inst->mlen, 2153 0, 2154 eot, 2155 inst->header_present); 2156} 2157 2158void 2159fs_visitor::generate_linterp(fs_inst *inst, 2160 struct brw_reg dst, struct brw_reg *src) 2161{ 2162 struct brw_reg delta_x = src[0]; 2163 struct brw_reg delta_y = src[1]; 2164 struct brw_reg interp = src[2]; 2165 2166 if (brw->has_pln && 2167 delta_y.nr == delta_x.nr + 1 && 2168 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2169 brw_PLN(p, dst, interp, delta_x); 2170 } else { 2171 brw_LINE(p, brw_null_reg(), interp, delta_x); 2172 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2173 } 2174} 2175 2176void 2177fs_visitor::generate_math(fs_inst *inst, 2178 struct brw_reg dst, struct brw_reg *src) 2179{ 2180 int op; 2181 2182 switch (inst->opcode) { 2183 case FS_OPCODE_RCP: 2184 op = BRW_MATH_FUNCTION_INV; 2185 break; 2186 case FS_OPCODE_RSQ: 2187 op = BRW_MATH_FUNCTION_RSQ; 2188 break; 2189 case FS_OPCODE_SQRT: 2190 op = BRW_MATH_FUNCTION_SQRT; 2191 break; 2192 case FS_OPCODE_EXP2: 2193 op = BRW_MATH_FUNCTION_EXP; 2194 break; 2195 case FS_OPCODE_LOG2: 2196 op = BRW_MATH_FUNCTION_LOG; 2197 break; 2198 case FS_OPCODE_POW: 2199 op = BRW_MATH_FUNCTION_POW; 2200 break; 2201 case FS_OPCODE_SIN: 2202 op = BRW_MATH_FUNCTION_SIN; 2203 break; 2204 case FS_OPCODE_COS: 2205 op = BRW_MATH_FUNCTION_COS; 2206 break; 2207 default: 2208 assert(!"not reached: unknown math function"); 2209 op = 0; 2210 break; 2211 } 2212 2213 if (intel->gen >= 6) { 2214 assert(inst->mlen == 0); 2215 2216 if (inst->opcode == FS_OPCODE_POW) { 2217 brw_math2(p, dst, op, src[0], src[1]); 2218 } else { 2219 brw_math(p, dst, 2220 op, 2221 inst->saturate ? 
BRW_MATH_SATURATE_SATURATE : 2222 BRW_MATH_SATURATE_NONE, 2223 0, src[0], 2224 BRW_MATH_DATA_VECTOR, 2225 BRW_MATH_PRECISION_FULL); 2226 } 2227 } else { 2228 assert(inst->mlen >= 1); 2229 2230 brw_math(p, dst, 2231 op, 2232 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2233 BRW_MATH_SATURATE_NONE, 2234 inst->base_mrf, src[0], 2235 BRW_MATH_DATA_VECTOR, 2236 BRW_MATH_PRECISION_FULL); 2237 } 2238} 2239 2240void 2241fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2242{ 2243 int msg_type = -1; 2244 int rlen = 4; 2245 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2246 2247 if (intel->gen >= 5) { 2248 switch (inst->opcode) { 2249 case FS_OPCODE_TEX: 2250 if (inst->shadow_compare) { 2251 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 2252 } else { 2253 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 2254 } 2255 break; 2256 case FS_OPCODE_TXB: 2257 if (inst->shadow_compare) { 2258 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 2259 } else { 2260 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 2261 } 2262 break; 2263 case FS_OPCODE_TXL: 2264 if (inst->shadow_compare) { 2265 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 2266 } else { 2267 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 2268 } 2269 break; 2270 case FS_OPCODE_TXD: 2271 assert(!"TXD isn't supported on gen5+ yet."); 2272 break; 2273 } 2274 } else { 2275 switch (inst->opcode) { 2276 case FS_OPCODE_TEX: 2277 /* Note that G45 and older determines shadow compare and dispatch width 2278 * from message length for most messages. 2279 */ 2280 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2281 if (inst->shadow_compare) { 2282 assert(inst->mlen == 6); 2283 } else { 2284 assert(inst->mlen <= 4); 2285 } 2286 break; 2287 case FS_OPCODE_TXB: 2288 if (inst->shadow_compare) { 2289 assert(inst->mlen == 6); 2290 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 2291 } else { 2292 assert(inst->mlen == 9); 2293 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2294 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2295 } 2296 break; 2297 case FS_OPCODE_TXL: 2298 if (inst->shadow_compare) { 2299 assert(inst->mlen == 6); 2300 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 2301 } else { 2302 assert(inst->mlen == 9); 2303 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 2304 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2305 } 2306 break; 2307 case FS_OPCODE_TXD: 2308 assert(!"TXD isn't supported on gen4 yet."); 2309 break; 2310 } 2311 } 2312 assert(msg_type != -1); 2313 2314 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 2315 rlen = 8; 2316 dst = vec16(dst); 2317 } 2318 2319 brw_SAMPLE(p, 2320 retype(dst, BRW_REGISTER_TYPE_UW), 2321 inst->base_mrf, 2322 src, 2323 SURF_INDEX_TEXTURE(inst->sampler), 2324 inst->sampler, 2325 WRITEMASK_XYZW, 2326 msg_type, 2327 rlen, 2328 inst->mlen, 2329 0, 2330 1, 2331 simd_mode); 2332} 2333 2334 2335/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2336 * looking like: 2337 * 2338 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2339 * 2340 * and we're trying to produce: 2341 * 2342 * DDX DDY 2343 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2344 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2345 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2346 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2347 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2348 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2349 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2350 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2351 * 2352 * and add another set of two more subspans if in 16-pixel dispatch mode. 
2353 * 2354 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2355 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2356 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2357 * between each other. We could probably do it like ddx and swizzle the right 2358 * order later, but bail for now and just produce 2359 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2360 */ 2361void 2362fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2363{ 2364 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2365 BRW_REGISTER_TYPE_F, 2366 BRW_VERTICAL_STRIDE_2, 2367 BRW_WIDTH_2, 2368 BRW_HORIZONTAL_STRIDE_0, 2369 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2370 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2371 BRW_REGISTER_TYPE_F, 2372 BRW_VERTICAL_STRIDE_2, 2373 BRW_WIDTH_2, 2374 BRW_HORIZONTAL_STRIDE_0, 2375 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2376 brw_ADD(p, dst, src0, negate(src1)); 2377} 2378 2379void 2380fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2381{ 2382 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2383 BRW_REGISTER_TYPE_F, 2384 BRW_VERTICAL_STRIDE_4, 2385 BRW_WIDTH_4, 2386 BRW_HORIZONTAL_STRIDE_0, 2387 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2388 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2389 BRW_REGISTER_TYPE_F, 2390 BRW_VERTICAL_STRIDE_4, 2391 BRW_WIDTH_4, 2392 BRW_HORIZONTAL_STRIDE_0, 2393 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2394 brw_ADD(p, dst, src0, negate(src1)); 2395} 2396 2397void 2398fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask) 2399{ 2400 if (intel->gen >= 6) { 2401 /* Gen6 no longer has the mask reg for us to just read the 2402 * active channels from. However, cmp updates just the channels 2403 * of the flag reg that are enabled, so we can get at the 2404 * channel enables that way. In this step, make a reg of ones 2405 * we'll compare to. 
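       * generate_discard_and() then CMPs this register against zero and
       * ANDs the surviving flag bits back into g1.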
2406 */ 2407 brw_MOV(p, mask, brw_imm_ud(1)); 2408 } else { 2409 brw_push_insn_state(p); 2410 brw_set_mask_control(p, BRW_MASK_DISABLE); 2411 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */ 2412 brw_pop_insn_state(p); 2413 } 2414} 2415 2416void 2417fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) 2418{ 2419 if (intel->gen >= 6) { 2420 struct brw_reg f0 = brw_flag_reg(); 2421 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2422 2423 brw_push_insn_state(p); 2424 brw_set_mask_control(p, BRW_MASK_DISABLE); 2425 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */ 2426 brw_pop_insn_state(p); 2427 2428 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2429 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */ 2430 /* Undo CMP's whacking of predication*/ 2431 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2432 2433 brw_push_insn_state(p); 2434 brw_set_mask_control(p, BRW_MASK_DISABLE); 2435 brw_AND(p, g1, f0, g1); 2436 brw_pop_insn_state(p); 2437 } else { 2438 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2439 2440 mask = brw_uw1_reg(mask.file, mask.nr, 0); 2441 2442 brw_push_insn_state(p); 2443 brw_set_mask_control(p, BRW_MASK_DISABLE); 2444 brw_AND(p, g0, mask, g0); 2445 brw_pop_insn_state(p); 2446 } 2447} 2448 2449void 2450fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2451{ 2452 assert(inst->mlen != 0); 2453 2454 brw_MOV(p, 2455 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2456 retype(src, BRW_REGISTER_TYPE_UD)); 2457 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2458 inst->offset); 2459} 2460 2461void 2462fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2463{ 2464 assert(inst->mlen != 0); 2465 2466 /* Clear any post destination dependencies that would be ignored by 2467 * the block read. See the B-Spec for pre-gen5 send instruction. 2468 * 2469 * This could use a better solution, since texture sampling and 2470 * math reads could potentially run into it as well -- anywhere 2471 * that we have a SEND with a destination that is a register that 2472 * was written but not read within the last N instructions (what's 2473 * N? unsure). This is rare because of dead code elimination, but 2474 * not impossible. 2475 */ 2476 if (intel->gen == 4 && !intel->is_g4x) 2477 brw_MOV(p, brw_null_reg(), dst); 2478 2479 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2480 inst->offset); 2481 2482 if (intel->gen == 4 && !intel->is_g4x) { 2483 /* gen4 errata: destination from a send can't be used as a 2484 * destination until it's been read. Just read it so we don't 2485 * have to worry. 2486 */ 2487 brw_MOV(p, brw_null_reg(), dst); 2488 } 2489} 2490 2491 2492void 2493fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2494{ 2495 assert(inst->mlen != 0); 2496 2497 /* Clear any post destination dependencies that would be ignored by 2498 * the block read. See the B-Spec for pre-gen5 send instruction. 2499 * 2500 * This could use a better solution, since texture sampling and 2501 * math reads could potentially run into it as well -- anywhere 2502 * that we have a SEND with a destination that is a register that 2503 * was written but not read within the last N instructions (what's 2504 * N? unsure). This is rare because of dead code elimination, but 2505 * not impossible. 
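    * The MOV from dst into the null register below is just a read of
    * dst, which forces any outstanding write to it to complete before
    * the send overwrites it.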
2506 */ 2507 if (intel->gen == 4 && !intel->is_g4x) 2508 brw_MOV(p, brw_null_reg(), dst); 2509 2510 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2511 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2512 2513 if (intel->gen == 4 && !intel->is_g4x) { 2514 /* gen4 errata: destination from a send can't be used as a 2515 * destination until it's been read. Just read it so we don't 2516 * have to worry. 2517 */ 2518 brw_MOV(p, brw_null_reg(), dst); 2519 } 2520} 2521 2522/** 2523 * To be called after the last _mesa_add_state_reference() call, to 2524 * set up prog_data.param[] for assign_curb_setup() and 2525 * setup_pull_constants(). 2526 */ 2527void 2528fs_visitor::setup_paramvalues_refs() 2529{ 2530 /* Set up the pointers to ParamValues now that that array is finalized. */ 2531 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2532 c->prog_data.param[i] = 2533 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2534 this->param_offset[i]; 2535 } 2536} 2537 2538void 2539fs_visitor::assign_curb_setup() 2540{ 2541 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2542 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2543 2544 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2545 foreach_iter(exec_list_iterator, iter, this->instructions) { 2546 fs_inst *inst = (fs_inst *)iter.get(); 2547 2548 for (unsigned int i = 0; i < 3; i++) { 2549 if (inst->src[i].file == UNIFORM) { 2550 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2551 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf + 2552 constant_nr / 8, 2553 constant_nr % 8); 2554 2555 inst->src[i].file = FIXED_HW_REG; 2556 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2557 } 2558 } 2559 } 2560} 2561 2562void 2563fs_visitor::calculate_urb_setup() 2564{ 2565 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2566 urb_setup[i] = -1; 2567 } 2568 2569 int urb_next = 0; 2570 /* Figure out where each of the incoming setup attributes lands. */ 2571 if (intel->gen >= 6) { 2572 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2573 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2574 urb_setup[i] = urb_next++; 2575 } 2576 } 2577 } else { 2578 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 2579 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2580 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2581 int fp_index; 2582 2583 if (i >= VERT_RESULT_VAR0) 2584 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2585 else if (i <= VERT_RESULT_TEX7) 2586 fp_index = i; 2587 else 2588 fp_index = -1; 2589 2590 if (fp_index >= 0) 2591 urb_setup[fp_index] = urb_next++; 2592 } 2593 } 2594 } 2595 2596 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2597 c->prog_data.urb_read_length = urb_next * 2; 2598} 2599 2600void 2601fs_visitor::assign_urb_setup() 2602{ 2603 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length; 2604 2605 /* Offset all the urb_setup[] index by the actual position of the 2606 * setup regs, now that the location of the constants has been chosen. 
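    * For example, with 2 payload registers and 16 push constants
    * (curb_read_length == 2), urb_start is 4, so an interpolation
    * source that interp_reg() set up as g0 actually reads g4.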
2607 */ 2608 foreach_iter(exec_list_iterator, iter, this->instructions) { 2609 fs_inst *inst = (fs_inst *)iter.get(); 2610 2611 if (inst->opcode == FS_OPCODE_LINTERP) { 2612 assert(inst->src[2].file == FIXED_HW_REG); 2613 inst->src[2].fixed_hw_reg.nr += urb_start; 2614 } 2615 2616 if (inst->opcode == FS_OPCODE_CINTERP) { 2617 assert(inst->src[0].file == FIXED_HW_REG); 2618 inst->src[0].fixed_hw_reg.nr += urb_start; 2619 } 2620 } 2621 2622 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2623} 2624 2625/** 2626 * Split large virtual GRFs into separate components if we can. 2627 * 2628 * This is mostly duplicated with what brw_fs_vector_splitting does, 2629 * but that's really conservative because it's afraid of doing 2630 * splitting that doesn't result in real progress after the rest of 2631 * the optimization phases, which would cause infinite looping in 2632 * optimization. We can do it once here, safely. This also has the 2633 * opportunity to split interpolated values, or maybe even uniforms, 2634 * which we don't have at the IR level. 2635 * 2636 * We want to split, because virtual GRFs are what we register 2637 * allocate and spill (due to contiguousness requirements for some 2638 * instructions), and they're what we naturally generate in the 2639 * codegen process, but most virtual GRFs don't actually need to be 2640 * contiguous sets of GRFs. If we split, we'll end up with reduced 2641 * live intervals and better dead code elimination and coalescing. 2642 */ 2643void 2644fs_visitor::split_virtual_grfs() 2645{ 2646 int num_vars = this->virtual_grf_next; 2647 bool split_grf[num_vars]; 2648 int new_virtual_grf[num_vars]; 2649 2650 /* Try to split anything > 0 sized. */ 2651 for (int i = 0; i < num_vars; i++) { 2652 if (this->virtual_grf_sizes[i] != 1) 2653 split_grf[i] = true; 2654 else 2655 split_grf[i] = false; 2656 } 2657 2658 if (brw->has_pln) { 2659 /* PLN opcodes rely on the delta_xy being contiguous. */ 2660 split_grf[this->delta_x.reg] = false; 2661 } 2662 2663 foreach_iter(exec_list_iterator, iter, this->instructions) { 2664 fs_inst *inst = (fs_inst *)iter.get(); 2665 2666 /* Texturing produces 4 contiguous registers, so no splitting. */ 2667 if (inst->is_tex()) { 2668 split_grf[inst->dst.reg] = false; 2669 } 2670 } 2671 2672 /* Allocate new space for split regs. Note that the virtual 2673 * numbers will be contiguous. 2674 */ 2675 for (int i = 0; i < num_vars; i++) { 2676 if (split_grf[i]) { 2677 new_virtual_grf[i] = virtual_grf_alloc(1); 2678 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 2679 int reg = virtual_grf_alloc(1); 2680 assert(reg == new_virtual_grf[i] + j - 1); 2681 (void) reg; 2682 } 2683 this->virtual_grf_sizes[i] = 1; 2684 } 2685 } 2686 2687 foreach_iter(exec_list_iterator, iter, this->instructions) { 2688 fs_inst *inst = (fs_inst *)iter.get(); 2689 2690 if (inst->dst.file == GRF && 2691 split_grf[inst->dst.reg] && 2692 inst->dst.reg_offset != 0) { 2693 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 2694 inst->dst.reg_offset - 1); 2695 inst->dst.reg_offset = 0; 2696 } 2697 for (int i = 0; i < 3; i++) { 2698 if (inst->src[i].file == GRF && 2699 split_grf[inst->src[i].reg] && 2700 inst->src[i].reg_offset != 0) { 2701 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 2702 inst->src[i].reg_offset - 1); 2703 inst->src[i].reg_offset = 0; 2704 } 2705 } 2706 } 2707 this->live_intervals_valid = false; 2708} 2709 2710/** 2711 * Choose accesses from the UNIFORM file to demote to using the pull 2712 * constant buffer. 
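 * (Push constants arrive preloaded in the thread payload through the
 * CURBE, while pull constants are fetched from the constant buffer
 * surface by an FS_OPCODE_PULL_CONSTANT_LOAD emitted at each use.)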
2713 * 2714 * We allow a fragment shader to have more than the specified minimum 2715 * maximum number of fragment shader uniform components (64). If 2716 * there are too many of these, they'd fill up all of register space. 2717 * So, this will push some of them out to the pull constant buffer and 2718 * update the program to load them. 2719 */ 2720void 2721fs_visitor::setup_pull_constants() 2722{ 2723 /* Only allow 16 registers (128 uniform components) as push constants. */ 2724 unsigned int max_uniform_components = 16 * 8; 2725 if (c->prog_data.nr_params <= max_uniform_components) 2726 return; 2727 2728 /* Just demote the end of the list. We could probably do better 2729 * here, demoting things that are rarely used in the program first. 2730 */ 2731 int pull_uniform_base = max_uniform_components; 2732 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 2733 2734 foreach_iter(exec_list_iterator, iter, this->instructions) { 2735 fs_inst *inst = (fs_inst *)iter.get(); 2736 2737 for (int i = 0; i < 3; i++) { 2738 if (inst->src[i].file != UNIFORM) 2739 continue; 2740 2741 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2742 if (uniform_nr < pull_uniform_base) 2743 continue; 2744 2745 fs_reg dst = fs_reg(this, glsl_type::float_type); 2746 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 2747 dst); 2748 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 2749 pull->ir = inst->ir; 2750 pull->annotation = inst->annotation; 2751 pull->base_mrf = 14; 2752 pull->mlen = 1; 2753 2754 inst->insert_before(pull); 2755 2756 inst->src[i].file = GRF; 2757 inst->src[i].reg = dst.reg; 2758 inst->src[i].reg_offset = 0; 2759 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 2760 } 2761 } 2762 2763 for (int i = 0; i < pull_uniform_count; i++) { 2764 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 2765 c->prog_data.pull_param_convert[i] = 2766 c->prog_data.param_convert[pull_uniform_base + i]; 2767 } 2768 c->prog_data.nr_params -= pull_uniform_count; 2769 c->prog_data.nr_pull_params = pull_uniform_count; 2770} 2771 2772void 2773fs_visitor::calculate_live_intervals() 2774{ 2775 int num_vars = this->virtual_grf_next; 2776 int *def = ralloc_array(mem_ctx, int, num_vars); 2777 int *use = ralloc_array(mem_ctx, int, num_vars); 2778 int loop_depth = 0; 2779 int loop_start = 0; 2780 int bb_header_ip = 0; 2781 2782 if (this->live_intervals_valid) 2783 return; 2784 2785 for (int i = 0; i < num_vars; i++) { 2786 def[i] = MAX_INSTRUCTION; 2787 use[i] = -1; 2788 } 2789 2790 int ip = 0; 2791 foreach_iter(exec_list_iterator, iter, this->instructions) { 2792 fs_inst *inst = (fs_inst *)iter.get(); 2793 2794 if (inst->opcode == BRW_OPCODE_DO) { 2795 if (loop_depth++ == 0) 2796 loop_start = ip; 2797 } else if (inst->opcode == BRW_OPCODE_WHILE) { 2798 loop_depth--; 2799 2800 if (loop_depth == 0) { 2801 /* Patches up the use of vars marked for being live across 2802 * the whole loop. 2803 */ 2804 for (int i = 0; i < num_vars; i++) { 2805 if (use[i] == loop_start) { 2806 use[i] = ip; 2807 } 2808 } 2809 } 2810 } else { 2811 for (unsigned int i = 0; i < 3; i++) { 2812 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 2813 int reg = inst->src[i].reg; 2814 2815 if (!loop_depth) { 2816 use[reg] = ip; 2817 } else { 2818 def[reg] = MIN2(loop_start, def[reg]); 2819 use[reg] = loop_start; 2820 2821 /* Nobody else is going to go smash our start to 2822 * later in the loop now, because def[reg] now 2823 * points before the bb header. 
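	     * (In effect the register is treated as live across the
	     * entire loop, from loop_start until the loop ends.)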
2824 */ 2825 } 2826 } 2827 } 2828 if (inst->dst.file == GRF && inst->dst.reg != 0) { 2829 int reg = inst->dst.reg; 2830 2831 if (!loop_depth) { 2832 def[reg] = MIN2(def[reg], ip); 2833 } else { 2834 def[reg] = MIN2(def[reg], loop_start); 2835 } 2836 } 2837 } 2838 2839 ip++; 2840 2841 /* Set the basic block header IP. This is used for determining 2842 * if a complete def of single-register virtual GRF in a loop 2843 * dominates a use in the same basic block. It's a quick way to 2844 * reduce the live interval range of most register used in a 2845 * loop. 2846 */ 2847 if (inst->opcode == BRW_OPCODE_IF || 2848 inst->opcode == BRW_OPCODE_ELSE || 2849 inst->opcode == BRW_OPCODE_ENDIF || 2850 inst->opcode == BRW_OPCODE_DO || 2851 inst->opcode == BRW_OPCODE_WHILE || 2852 inst->opcode == BRW_OPCODE_BREAK || 2853 inst->opcode == BRW_OPCODE_CONTINUE) { 2854 bb_header_ip = ip; 2855 } 2856 } 2857 2858 ralloc_free(this->virtual_grf_def); 2859 ralloc_free(this->virtual_grf_use); 2860 this->virtual_grf_def = def; 2861 this->virtual_grf_use = use; 2862 2863 this->live_intervals_valid = true; 2864} 2865 2866/** 2867 * Attempts to move immediate constants into the immediate 2868 * constant slot of following instructions. 2869 * 2870 * Immediate constants are a bit tricky -- they have to be in the last 2871 * operand slot, you can't do abs/negate on them, 2872 */ 2873 2874bool 2875fs_visitor::propagate_constants() 2876{ 2877 bool progress = false; 2878 2879 calculate_live_intervals(); 2880 2881 foreach_iter(exec_list_iterator, iter, this->instructions) { 2882 fs_inst *inst = (fs_inst *)iter.get(); 2883 2884 if (inst->opcode != BRW_OPCODE_MOV || 2885 inst->predicated || 2886 inst->dst.file != GRF || inst->src[0].file != IMM || 2887 inst->dst.type != inst->src[0].type) 2888 continue; 2889 2890 /* Don't bother with cases where we should have had the 2891 * operation on the constant folded in GLSL already. 2892 */ 2893 if (inst->saturate) 2894 continue; 2895 2896 /* Found a move of a constant to a GRF. Find anything else using the GRF 2897 * before it's written, and replace it with the constant if we can. 2898 */ 2899 exec_list_iterator scan_iter = iter; 2900 scan_iter.next(); 2901 for (; scan_iter.has_next(); scan_iter.next()) { 2902 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 2903 2904 if (scan_inst->opcode == BRW_OPCODE_DO || 2905 scan_inst->opcode == BRW_OPCODE_WHILE || 2906 scan_inst->opcode == BRW_OPCODE_ELSE || 2907 scan_inst->opcode == BRW_OPCODE_ENDIF) { 2908 break; 2909 } 2910 2911 for (int i = 2; i >= 0; i--) { 2912 if (scan_inst->src[i].file != GRF || 2913 scan_inst->src[i].reg != inst->dst.reg || 2914 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 2915 continue; 2916 2917 /* Don't bother with cases where we should have had the 2918 * operation on the constant folded in GLSL already. 
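	  * A negate or abs modifier on the source would have to be
	  * folded into the immediate to substitute it here, so skip
	  * those cases.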
2919 */ 2920 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 2921 continue; 2922 2923 switch (scan_inst->opcode) { 2924 case BRW_OPCODE_MOV: 2925 scan_inst->src[i] = inst->src[0]; 2926 progress = true; 2927 break; 2928 2929 case BRW_OPCODE_MUL: 2930 case BRW_OPCODE_ADD: 2931 if (i == 1) { 2932 scan_inst->src[i] = inst->src[0]; 2933 progress = true; 2934 } else if (i == 0 && scan_inst->src[1].file != IMM) { 2935 /* Fit this constant in by commuting the operands */ 2936 scan_inst->src[0] = scan_inst->src[1]; 2937 scan_inst->src[1] = inst->src[0]; 2938 progress = true; 2939 } 2940 break; 2941 case BRW_OPCODE_CMP: 2942 case BRW_OPCODE_SEL: 2943 if (i == 1) { 2944 scan_inst->src[i] = inst->src[0]; 2945 progress = true; 2946 } 2947 } 2948 } 2949 2950 if (scan_inst->dst.file == GRF && 2951 scan_inst->dst.reg == inst->dst.reg && 2952 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 2953 scan_inst->is_tex())) { 2954 break; 2955 } 2956 } 2957 } 2958 2959 if (progress) 2960 this->live_intervals_valid = false; 2961 2962 return progress; 2963} 2964/** 2965 * Must be called after calculate_live_intervales() to remove unused 2966 * writes to registers -- register allocation will fail otherwise 2967 * because something deffed but not used won't be considered to 2968 * interfere with other regs. 2969 */ 2970bool 2971fs_visitor::dead_code_eliminate() 2972{ 2973 bool progress = false; 2974 int pc = 0; 2975 2976 calculate_live_intervals(); 2977 2978 foreach_iter(exec_list_iterator, iter, this->instructions) { 2979 fs_inst *inst = (fs_inst *)iter.get(); 2980 2981 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 2982 inst->remove(); 2983 progress = true; 2984 } 2985 2986 pc++; 2987 } 2988 2989 if (progress) 2990 live_intervals_valid = false; 2991 2992 return progress; 2993} 2994 2995bool 2996fs_visitor::register_coalesce() 2997{ 2998 bool progress = false; 2999 int if_depth = 0; 3000 int loop_depth = 0; 3001 3002 foreach_iter(exec_list_iterator, iter, this->instructions) { 3003 fs_inst *inst = (fs_inst *)iter.get(); 3004 3005 /* Make sure that we dominate the instructions we're going to 3006 * scan for interfering with our coalescing, or we won't have 3007 * scanned enough to see if anything interferes with our 3008 * coalescing. We don't dominate the following instructions if 3009 * we're in a loop or an if block. 3010 */ 3011 switch (inst->opcode) { 3012 case BRW_OPCODE_DO: 3013 loop_depth++; 3014 break; 3015 case BRW_OPCODE_WHILE: 3016 loop_depth--; 3017 break; 3018 case BRW_OPCODE_IF: 3019 if_depth++; 3020 break; 3021 case BRW_OPCODE_ENDIF: 3022 if_depth--; 3023 break; 3024 } 3025 if (loop_depth || if_depth) 3026 continue; 3027 3028 if (inst->opcode != BRW_OPCODE_MOV || 3029 inst->predicated || 3030 inst->saturate || 3031 inst->dst.file != GRF || inst->src[0].file != GRF || 3032 inst->dst.type != inst->src[0].type) 3033 continue; 3034 3035 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 3036 3037 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3038 * them: check for no writes to either one until the exit of the 3039 * program. 
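       * For example, MOV b, a followed only by reads of b can go away
       * once those reads are rewritten to use a directly, provided
       * neither a nor b is written again afterwards.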
3040 */ 3041 bool interfered = false; 3042 exec_list_iterator scan_iter = iter; 3043 scan_iter.next(); 3044 for (; scan_iter.has_next(); scan_iter.next()) { 3045 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3046 3047 if (scan_inst->dst.file == GRF) { 3048 if (scan_inst->dst.reg == inst->dst.reg && 3049 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3050 scan_inst->is_tex())) { 3051 interfered = true; 3052 break; 3053 } 3054 if (scan_inst->dst.reg == inst->src[0].reg && 3055 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3056 scan_inst->is_tex())) { 3057 interfered = true; 3058 break; 3059 } 3060 } 3061 3062 /* The gen6 MATH instruction can't handle source modifiers, so avoid 3063 * coalescing those for now. We should do something more specific. 3064 */ 3065 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) { 3066 interfered = true; 3067 break; 3068 } 3069 } 3070 if (interfered) { 3071 continue; 3072 } 3073 3074 /* Rewrite the later usage to point at the source of the move to 3075 * be removed. 3076 */ 3077 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3078 scan_iter.next()) { 3079 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3080 3081 for (int i = 0; i < 3; i++) { 3082 if (scan_inst->src[i].file == GRF && 3083 scan_inst->src[i].reg == inst->dst.reg && 3084 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3085 scan_inst->src[i].reg = inst->src[0].reg; 3086 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3087 scan_inst->src[i].abs |= inst->src[0].abs; 3088 scan_inst->src[i].negate ^= inst->src[0].negate; 3089 scan_inst->src[i].smear = inst->src[0].smear; 3090 } 3091 } 3092 } 3093 3094 inst->remove(); 3095 progress = true; 3096 } 3097 3098 if (progress) 3099 live_intervals_valid = false; 3100 3101 return progress; 3102} 3103 3104 3105bool 3106fs_visitor::compute_to_mrf() 3107{ 3108 bool progress = false; 3109 int next_ip = 0; 3110 3111 calculate_live_intervals(); 3112 3113 foreach_iter(exec_list_iterator, iter, this->instructions) { 3114 fs_inst *inst = (fs_inst *)iter.get(); 3115 3116 int ip = next_ip; 3117 next_ip++; 3118 3119 if (inst->opcode != BRW_OPCODE_MOV || 3120 inst->predicated || 3121 inst->dst.file != MRF || inst->src[0].file != GRF || 3122 inst->dst.type != inst->src[0].type || 3123 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3124 continue; 3125 3126 /* Can't compute-to-MRF this GRF if someone else was going to 3127 * read it later. 3128 */ 3129 if (this->virtual_grf_use[inst->src[0].reg] > ip) 3130 continue; 3131 3132 /* Found a move of a GRF to a MRF. Let's see if we can go 3133 * rewrite the thing that made this GRF to write into the MRF. 3134 */ 3135 fs_inst *scan_inst; 3136 for (scan_inst = (fs_inst *)inst->prev; 3137 scan_inst->prev != NULL; 3138 scan_inst = (fs_inst *)scan_inst->prev) { 3139 if (scan_inst->dst.file == GRF && 3140 scan_inst->dst.reg == inst->src[0].reg) { 3141 /* Found the last thing to write our reg we want to turn 3142 * into a compute-to-MRF. 3143 */ 3144 3145 if (scan_inst->is_tex()) { 3146 /* texturing writes several continuous regs, so we can't 3147 * compute-to-mrf that. 3148 */ 3149 break; 3150 } 3151 3152 /* If it's predicated, it (probably) didn't populate all 3153 * the channels. 3154 */ 3155 if (scan_inst->predicated) 3156 break; 3157 3158 /* SEND instructions can't have MRF as a destination. 
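	    * (A nonzero mlen marks the instruction as a send, so its
	    * destination can't be retargeted to the MRF.)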
*/ 3159 if (scan_inst->mlen) 3160 break; 3161 3162 if (intel->gen >= 6) { 3163 /* gen6 math instructions must have the destination be 3164 * GRF, so no compute-to-MRF for them. 3165 */ 3166 if (scan_inst->is_math()) { 3167 break; 3168 } 3169 } 3170 3171 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 3172 /* Found the creator of our MRF's source value. */ 3173 scan_inst->dst.file = MRF; 3174 scan_inst->dst.hw_reg = inst->dst.hw_reg; 3175 scan_inst->saturate |= inst->saturate; 3176 inst->remove(); 3177 progress = true; 3178 } 3179 break; 3180 } 3181 3182 /* We don't handle flow control here. Most computation of 3183 * values that end up in MRFs are shortly before the MRF 3184 * write anyway. 3185 */ 3186 if (scan_inst->opcode == BRW_OPCODE_DO || 3187 scan_inst->opcode == BRW_OPCODE_WHILE || 3188 scan_inst->opcode == BRW_OPCODE_ELSE || 3189 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3190 break; 3191 } 3192 3193 /* You can't read from an MRF, so if someone else reads our 3194 * MRF's source GRF that we wanted to rewrite, that stops us. 3195 */ 3196 bool interfered = false; 3197 for (int i = 0; i < 3; i++) { 3198 if (scan_inst->src[i].file == GRF && 3199 scan_inst->src[i].reg == inst->src[0].reg && 3200 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 3201 interfered = true; 3202 } 3203 } 3204 if (interfered) 3205 break; 3206 3207 if (scan_inst->dst.file == MRF && 3208 scan_inst->dst.hw_reg == inst->dst.hw_reg) { 3209 /* Somebody else wrote our MRF here, so we can't can't 3210 * compute-to-MRF before that. 3211 */ 3212 break; 3213 } 3214 3215 if (scan_inst->mlen > 0) { 3216 /* Found a SEND instruction, which means that there are 3217 * live values in MRFs from base_mrf to base_mrf + 3218 * scan_inst->mlen - 1. Don't go pushing our MRF write up 3219 * above it. 3220 */ 3221 if (inst->dst.hw_reg >= scan_inst->base_mrf && 3222 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) { 3223 break; 3224 } 3225 } 3226 } 3227 } 3228 3229 return progress; 3230} 3231 3232/** 3233 * Walks through basic blocks, locking for repeated MRF writes and 3234 * removing the later ones. 3235 */ 3236bool 3237fs_visitor::remove_duplicate_mrf_writes() 3238{ 3239 fs_inst *last_mrf_move[16]; 3240 bool progress = false; 3241 3242 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3243 3244 foreach_iter(exec_list_iterator, iter, this->instructions) { 3245 fs_inst *inst = (fs_inst *)iter.get(); 3246 3247 switch (inst->opcode) { 3248 case BRW_OPCODE_DO: 3249 case BRW_OPCODE_WHILE: 3250 case BRW_OPCODE_IF: 3251 case BRW_OPCODE_ELSE: 3252 case BRW_OPCODE_ENDIF: 3253 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3254 continue; 3255 default: 3256 break; 3257 } 3258 3259 if (inst->opcode == BRW_OPCODE_MOV && 3260 inst->dst.file == MRF) { 3261 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 3262 if (prev_inst && inst->equals(prev_inst)) { 3263 inst->remove(); 3264 progress = true; 3265 continue; 3266 } 3267 } 3268 3269 /* Clear out the last-write records for MRFs that were overwritten. */ 3270 if (inst->dst.file == MRF) { 3271 last_mrf_move[inst->dst.hw_reg] = NULL; 3272 } 3273 3274 if (inst->mlen > 0) { 3275 /* Found a SEND instruction, which will include two or fewer 3276 * implied MRF writes. We could do better here. 3277 */ 3278 for (int i = 0; i < implied_mrf_writes(inst); i++) { 3279 last_mrf_move[inst->base_mrf + i] = NULL; 3280 } 3281 } 3282 3283 /* Clear out any MRF move records whose sources got overwritten. 
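       * A later identical-looking MOV from that GRF would copy a
       * different value, so the recorded move can no longer stand in
       * for it.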
*/ 3284 if (inst->dst.file == GRF) { 3285 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3286 if (last_mrf_move[i] && 3287 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3288 last_mrf_move[i] = NULL; 3289 } 3290 } 3291 } 3292 3293 if (inst->opcode == BRW_OPCODE_MOV && 3294 inst->dst.file == MRF && 3295 inst->src[0].file == GRF && 3296 !inst->predicated) { 3297 last_mrf_move[inst->dst.hw_reg] = inst; 3298 } 3299 } 3300 3301 return progress; 3302} 3303 3304bool 3305fs_visitor::virtual_grf_interferes(int a, int b) 3306{ 3307 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3308 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3309 3310 /* We can't handle dead register writes here, without iterating 3311 * over the whole instruction stream to find every single dead 3312 * write to that register to compare to the live interval of the 3313 * other register. Just assert that dead_code_eliminate() has been 3314 * called. 3315 */ 3316 assert((this->virtual_grf_use[a] != -1 || 3317 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3318 (this->virtual_grf_use[b] != -1 || 3319 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3320 3321 return start < end; 3322} 3323 3324static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3325{ 3326 struct brw_reg brw_reg; 3327 3328 switch (reg->file) { 3329 case GRF: 3330 case ARF: 3331 case MRF: 3332 if (reg->smear == -1) { 3333 brw_reg = brw_vec8_reg(reg->file, 3334 reg->hw_reg, 0); 3335 } else { 3336 brw_reg = brw_vec1_reg(reg->file, 3337 reg->hw_reg, reg->smear); 3338 } 3339 brw_reg = retype(brw_reg, reg->type); 3340 break; 3341 case IMM: 3342 switch (reg->type) { 3343 case BRW_REGISTER_TYPE_F: 3344 brw_reg = brw_imm_f(reg->imm.f); 3345 break; 3346 case BRW_REGISTER_TYPE_D: 3347 brw_reg = brw_imm_d(reg->imm.i); 3348 break; 3349 case BRW_REGISTER_TYPE_UD: 3350 brw_reg = brw_imm_ud(reg->imm.u); 3351 break; 3352 default: 3353 assert(!"not reached"); 3354 brw_reg = brw_null_reg(); 3355 break; 3356 } 3357 break; 3358 case FIXED_HW_REG: 3359 brw_reg = reg->fixed_hw_reg; 3360 break; 3361 case BAD_FILE: 3362 /* Probably unused. 
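       * Sources an instruction doesn't use stay BAD_FILE; hand back
       * the null register so there is something harmless to encode.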
*/ 3363 brw_reg = brw_null_reg(); 3364 break; 3365 case UNIFORM: 3366 assert(!"not reached"); 3367 brw_reg = brw_null_reg(); 3368 break; 3369 default: 3370 assert(!"not reached"); 3371 brw_reg = brw_null_reg(); 3372 break; 3373 } 3374 if (reg->abs) 3375 brw_reg = brw_abs(brw_reg); 3376 if (reg->negate) 3377 brw_reg = negate(brw_reg); 3378 3379 return brw_reg; 3380} 3381 3382void 3383fs_visitor::generate_code() 3384{ 3385 int last_native_inst = 0; 3386 const char *last_annotation_string = NULL; 3387 ir_instruction *last_annotation_ir = NULL; 3388 3389 int if_stack_array_size = 16; 3390 int loop_stack_array_size = 16; 3391 int if_stack_depth = 0, loop_stack_depth = 0; 3392 brw_instruction **if_stack = 3393 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size); 3394 brw_instruction **loop_stack = 3395 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); 3396 int *if_depth_in_loop = 3397 rzalloc_array(this->mem_ctx, int, loop_stack_array_size); 3398 3399 3400 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3401 printf("Native code for fragment shader %d:\n", 3402 ctx->Shader.CurrentFragmentProgram->Name); 3403 } 3404 3405 foreach_iter(exec_list_iterator, iter, this->instructions) { 3406 fs_inst *inst = (fs_inst *)iter.get(); 3407 struct brw_reg src[3], dst; 3408 3409 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3410 if (last_annotation_ir != inst->ir) { 3411 last_annotation_ir = inst->ir; 3412 if (last_annotation_ir) { 3413 printf(" "); 3414 last_annotation_ir->print(); 3415 printf("\n"); 3416 } 3417 } 3418 if (last_annotation_string != inst->annotation) { 3419 last_annotation_string = inst->annotation; 3420 if (last_annotation_string) 3421 printf(" %s\n", last_annotation_string); 3422 } 3423 } 3424 3425 for (unsigned int i = 0; i < 3; i++) { 3426 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3427 } 3428 dst = brw_reg_from_fs_reg(&inst->dst); 3429 3430 brw_set_conditionalmod(p, inst->conditional_mod); 3431 brw_set_predicate_control(p, inst->predicated); 3432 brw_set_saturate(p, inst->saturate); 3433 3434 switch (inst->opcode) { 3435 case BRW_OPCODE_MOV: 3436 brw_MOV(p, dst, src[0]); 3437 break; 3438 case BRW_OPCODE_ADD: 3439 brw_ADD(p, dst, src[0], src[1]); 3440 break; 3441 case BRW_OPCODE_MUL: 3442 brw_MUL(p, dst, src[0], src[1]); 3443 break; 3444 3445 case BRW_OPCODE_FRC: 3446 brw_FRC(p, dst, src[0]); 3447 break; 3448 case BRW_OPCODE_RNDD: 3449 brw_RNDD(p, dst, src[0]); 3450 break; 3451 case BRW_OPCODE_RNDE: 3452 brw_RNDE(p, dst, src[0]); 3453 break; 3454 case BRW_OPCODE_RNDZ: 3455 brw_RNDZ(p, dst, src[0]); 3456 break; 3457 3458 case BRW_OPCODE_AND: 3459 brw_AND(p, dst, src[0], src[1]); 3460 break; 3461 case BRW_OPCODE_OR: 3462 brw_OR(p, dst, src[0], src[1]); 3463 break; 3464 case BRW_OPCODE_XOR: 3465 brw_XOR(p, dst, src[0], src[1]); 3466 break; 3467 case BRW_OPCODE_NOT: 3468 brw_NOT(p, dst, src[0]); 3469 break; 3470 case BRW_OPCODE_ASR: 3471 brw_ASR(p, dst, src[0], src[1]); 3472 break; 3473 case BRW_OPCODE_SHR: 3474 brw_SHR(p, dst, src[0], src[1]); 3475 break; 3476 case BRW_OPCODE_SHL: 3477 brw_SHL(p, dst, src[0], src[1]); 3478 break; 3479 3480 case BRW_OPCODE_CMP: 3481 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3482 break; 3483 case BRW_OPCODE_SEL: 3484 brw_SEL(p, dst, src[0], src[1]); 3485 break; 3486 3487 case BRW_OPCODE_IF: 3488 if (inst->src[0].file != BAD_FILE) { 3489 assert(intel->gen >= 6); 3490 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]); 3491 } else { 3492 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8); 3493 } 
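	 /* Track how many IFs are open inside the current loop so the
	  * BREAK/CONTINUE cases below know how many nested IFs they are
	  * jumping out of.
	  */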
3494 if_depth_in_loop[loop_stack_depth]++; 3495 if_stack_depth++; 3496 if (if_stack_array_size <= if_stack_depth) { 3497 if_stack_array_size *= 2; 3498 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *, 3499 if_stack_array_size); 3500 } 3501 break; 3502 3503 case BRW_OPCODE_ELSE: 3504 if_stack[if_stack_depth - 1] = 3505 brw_ELSE(p, if_stack[if_stack_depth - 1]); 3506 break; 3507 case BRW_OPCODE_ENDIF: 3508 if_stack_depth--; 3509 brw_ENDIF(p , if_stack[if_stack_depth]); 3510 if_depth_in_loop[loop_stack_depth]--; 3511 break; 3512 3513 case BRW_OPCODE_DO: 3514 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3515 if (loop_stack_array_size <= loop_stack_depth) { 3516 loop_stack_array_size *= 2; 3517 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, 3518 loop_stack_array_size); 3519 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, 3520 loop_stack_array_size); 3521 } 3522 if_depth_in_loop[loop_stack_depth] = 0; 3523 break; 3524 3525 case BRW_OPCODE_BREAK: 3526 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3527 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3528 break; 3529 case BRW_OPCODE_CONTINUE: 3530 /* FINISHME: We need to write the loop instruction support still. */ 3531 if (intel->gen >= 6) 3532 gen6_CONT(p, loop_stack[loop_stack_depth - 1]); 3533 else 3534 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3535 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3536 break; 3537 3538 case BRW_OPCODE_WHILE: { 3539 struct brw_instruction *inst0, *inst1; 3540 GLuint br = 1; 3541 3542 if (intel->gen >= 5) 3543 br = 2; 3544 3545 assert(loop_stack_depth > 0); 3546 loop_stack_depth--; 3547 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3548 if (intel->gen < 6) { 3549 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3550 while (inst0 > loop_stack[loop_stack_depth]) { 3551 inst0--; 3552 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3553 inst0->bits3.if_else.jump_count == 0) { 3554 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3555 } 3556 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3557 inst0->bits3.if_else.jump_count == 0) { 3558 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3559 } 3560 } 3561 } 3562 } 3563 break; 3564 3565 case FS_OPCODE_RCP: 3566 case FS_OPCODE_RSQ: 3567 case FS_OPCODE_SQRT: 3568 case FS_OPCODE_EXP2: 3569 case FS_OPCODE_LOG2: 3570 case FS_OPCODE_POW: 3571 case FS_OPCODE_SIN: 3572 case FS_OPCODE_COS: 3573 generate_math(inst, dst, src); 3574 break; 3575 case FS_OPCODE_CINTERP: 3576 brw_MOV(p, dst, src[0]); 3577 break; 3578 case FS_OPCODE_LINTERP: 3579 generate_linterp(inst, dst, src); 3580 break; 3581 case FS_OPCODE_TEX: 3582 case FS_OPCODE_TXB: 3583 case FS_OPCODE_TXD: 3584 case FS_OPCODE_TXL: 3585 generate_tex(inst, dst, src[0]); 3586 break; 3587 case FS_OPCODE_DISCARD_NOT: 3588 generate_discard_not(inst, dst); 3589 break; 3590 case FS_OPCODE_DISCARD_AND: 3591 generate_discard_and(inst, src[0]); 3592 break; 3593 case FS_OPCODE_DDX: 3594 generate_ddx(inst, dst, src[0]); 3595 break; 3596 case FS_OPCODE_DDY: 3597 generate_ddy(inst, dst, src[0]); 3598 break; 3599 3600 case FS_OPCODE_SPILL: 3601 generate_spill(inst, src[0]); 3602 break; 3603 3604 case FS_OPCODE_UNSPILL: 3605 generate_unspill(inst, dst); 3606 break; 3607 3608 case FS_OPCODE_PULL_CONSTANT_LOAD: 3609 generate_pull_constant_load(inst, dst); 3610 break; 3611 3612 case FS_OPCODE_FB_WRITE: 3613 generate_fb_write(inst); 3614 break; 3615 default: 3616 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { 
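	 /* A defined hardware opcode we just don't generate, so report
	  * it by name rather than by number.
	  */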
3617 _mesa_problem(ctx, "Unsupported opcode `%s' in FS", 3618 brw_opcodes[inst->opcode].name); 3619 } else { 3620 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); 3621 } 3622 fail("unsupported opcode in FS\n"); 3623 } 3624 3625 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3626 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) { 3627 if (0) { 3628 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3629 ((uint32_t *)&p->store[i])[3], 3630 ((uint32_t *)&p->store[i])[2], 3631 ((uint32_t *)&p->store[i])[1], 3632 ((uint32_t *)&p->store[i])[0]); 3633 } 3634 brw_disasm(stdout, &p->store[i], intel->gen); 3635 } 3636 } 3637 3638 last_native_inst = p->nr_insn; 3639 } 3640 3641 ralloc_free(if_stack); 3642 ralloc_free(loop_stack); 3643 ralloc_free(if_depth_in_loop); 3644 3645 brw_set_uip_jip(p); 3646 3647 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS 3648 * emit issues, it doesn't get the jump distances into the output, 3649 * which is often something we want to debug. So this is here in 3650 * case you're doing that. 3651 */ 3652 if (0) { 3653 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3654 for (unsigned int i = 0; i < p->nr_insn; i++) { 3655 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3656 ((uint32_t *)&p->store[i])[3], 3657 ((uint32_t *)&p->store[i])[2], 3658 ((uint32_t *)&p->store[i])[1], 3659 ((uint32_t *)&p->store[i])[0]); 3660 brw_disasm(stdout, &p->store[i], intel->gen); 3661 } 3662 } 3663 } 3664} 3665 3666GLboolean 3667brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) 3668{ 3669 struct intel_context *intel = &brw->intel; 3670 struct gl_context *ctx = &intel->ctx; 3671 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; 3672 3673 if (!prog) 3674 return GL_FALSE; 3675 3676 struct brw_shader *shader = 3677 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 3678 if (!shader) 3679 return GL_FALSE; 3680 3681 /* We always use 8-wide mode, at least for now. For one, flow 3682 * control only works in 8-wide. Also, when we're fragment shader 3683 * bound, we're almost always under register pressure as well, so 3684 * 8-wide would save us from the performance cliff of spilling 3685 * regs. 3686 */ 3687 c->dispatch_width = 8; 3688 3689 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3690 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 3691 _mesa_print_ir(shader->ir, NULL); 3692 printf("\n"); 3693 } 3694 3695 /* Now the main event: Visit the shader IR and generate our FS IR for it. 3696 */ 3697 fs_visitor v(c, shader); 3698 3699 if (0) { 3700 v.emit_dummy_fs(); 3701 } else { 3702 v.calculate_urb_setup(); 3703 if (intel->gen < 6) 3704 v.emit_interpolation_setup_gen4(); 3705 else 3706 v.emit_interpolation_setup_gen6(); 3707 3708 /* Generate FS IR for main(). (the visitor only descends into 3709 * functions called "main"). 
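       * Other functions should already have been inlined into main()
       * by the GLSL optimization passes run at link time.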
3710 */ 3711 foreach_iter(exec_list_iterator, iter, *shader->ir) { 3712 ir_instruction *ir = (ir_instruction *)iter.get(); 3713 v.base_ir = ir; 3714 ir->accept(&v); 3715 } 3716 3717 v.emit_fb_writes(); 3718 3719 v.split_virtual_grfs(); 3720 3721 v.setup_paramvalues_refs(); 3722 v.setup_pull_constants(); 3723 3724 bool progress; 3725 do { 3726 progress = false; 3727 3728 progress = v.remove_duplicate_mrf_writes() || progress; 3729 3730 progress = v.propagate_constants() || progress; 3731 progress = v.register_coalesce() || progress; 3732 progress = v.compute_to_mrf() || progress; 3733 progress = v.dead_code_eliminate() || progress; 3734 } while (progress); 3735 3736 v.schedule_instructions(); 3737 3738 v.assign_curb_setup(); 3739 v.assign_urb_setup(); 3740 3741 if (0) { 3742 /* Debug of register spilling: Go spill everything. */ 3743 int virtual_grf_count = v.virtual_grf_next; 3744 for (int i = 1; i < virtual_grf_count; i++) { 3745 v.spill_reg(i); 3746 } 3747 } 3748 3749 if (0) 3750 v.assign_regs_trivial(); 3751 else { 3752 while (!v.assign_regs()) { 3753 if (v.failed) 3754 break; 3755 } 3756 } 3757 } 3758 3759 if (!v.failed) 3760 v.generate_code(); 3761 3762 assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */ 3763 3764 if (v.failed) 3765 return GL_FALSE; 3766 3767 c->prog_data.total_grf = v.grf_used; 3768 3769 return GL_TRUE; 3770} 3771