brw_fs.cpp revision 1991d92207cf629ba4ceead4bfc3f768d7b9e402
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28extern "C" { 29 30#include <sys/types.h> 31 32#include "main/macros.h" 33#include "main/shaderobj.h" 34#include "main/uniforms.h" 35#include "program/prog_parameter.h" 36#include "program/prog_print.h" 37#include "program/prog_optimize.h" 38#include "program/register_allocate.h" 39#include "program/sampler.h" 40#include "program/hash_table.h" 41#include "brw_context.h" 42#include "brw_eu.h" 43#include "brw_wm.h" 44#include "talloc.h" 45} 46#include "brw_fs.h" 47#include "../glsl/glsl_types.h" 48#include "../glsl/ir_optimization.h" 49#include "../glsl/ir_print_visitor.h" 50 51#define MAX_INSTRUCTION (1 << 30) 52static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); 53 54struct gl_shader * 55brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) 56{ 57 struct brw_shader *shader; 58 59 shader = talloc_zero(NULL, struct brw_shader); 60 if (shader) { 61 shader->base.Type = type; 62 shader->base.Name = name; 63 _mesa_init_shader(ctx, &shader->base); 64 } 65 66 return &shader->base; 67} 68 69struct gl_shader_program * 70brw_new_shader_program(struct gl_context *ctx, GLuint name) 71{ 72 struct brw_shader_program *prog; 73 prog = talloc_zero(NULL, struct brw_shader_program); 74 if (prog) { 75 prog->base.Name = name; 76 _mesa_init_shader_program(ctx, &prog->base); 77 } 78 return &prog->base; 79} 80 81GLboolean 82brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader) 83{ 84 if (!_mesa_ir_compile_shader(ctx, shader)) 85 return GL_FALSE; 86 87 return GL_TRUE; 88} 89 90GLboolean 91brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) 92{ 93 struct brw_context *brw = brw_context(ctx); 94 struct intel_context *intel = &brw->intel; 95 96 struct brw_shader *shader = 97 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 98 if (shader != NULL) { 99 void *mem_ctx = talloc_new(NULL); 100 bool progress; 101 102 if (shader->ir) 103 talloc_free(shader->ir); 
104 shader->ir = new(shader) exec_list; 105 clone_ir_list(mem_ctx, shader->ir, shader->base.ir); 106 107 do_mat_op_to_vec(shader->ir); 108 lower_instructions(shader->ir, 109 MOD_TO_FRACT | 110 DIV_TO_MUL_RCP | 111 SUB_TO_ADD_NEG | 112 EXP_TO_EXP2 | 113 LOG_TO_LOG2); 114 115 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, 116 * if-statements need to be flattened. 117 */ 118 if (intel->gen < 6) 119 lower_if_to_cond_assign(shader->ir, 16); 120 121 do_lower_texture_projection(shader->ir); 122 do_vec_index_to_cond_assign(shader->ir); 123 brw_do_cubemap_normalize(shader->ir); 124 125 do { 126 progress = false; 127 128 brw_do_channel_expressions(shader->ir); 129 brw_do_vector_splitting(shader->ir); 130 131 progress = do_lower_jumps(shader->ir, true, true, 132 true, /* main return */ 133 false, /* continue */ 134 false /* loops */ 135 ) || progress; 136 137 progress = do_common_optimization(shader->ir, true, 32) || progress; 138 139 progress = lower_noise(shader->ir) || progress; 140 progress = 141 lower_variable_index_to_cond_assign(shader->ir, 142 GL_TRUE, /* input */ 143 GL_TRUE, /* output */ 144 GL_TRUE, /* temp */ 145 GL_TRUE /* uniform */ 146 ) || progress; 147 progress = lower_quadop_vector(shader->ir, false) || progress; 148 } while (progress); 149 150 validate_ir_tree(shader->ir); 151 152 reparent_ir(shader->ir, shader->ir); 153 talloc_free(mem_ctx); 154 } 155 156 if (!_mesa_ir_link_shader(ctx, prog)) 157 return GL_FALSE; 158 159 return GL_TRUE; 160} 161 162static int 163type_size(const struct glsl_type *type) 164{ 165 unsigned int size, i; 166 167 switch (type->base_type) { 168 case GLSL_TYPE_UINT: 169 case GLSL_TYPE_INT: 170 case GLSL_TYPE_FLOAT: 171 case GLSL_TYPE_BOOL: 172 return type->components(); 173 case GLSL_TYPE_ARRAY: 174 return type_size(type->fields.array) * type->length; 175 case GLSL_TYPE_STRUCT: 176 size = 0; 177 for (i = 0; i < type->length; i++) { 178 size += type_size(type->fields.structure[i].type); 179 } 180 return size; 181 
case GLSL_TYPE_SAMPLER: 182 /* Samplers take up no register space, since they're baked in at 183 * link time. 184 */ 185 return 0; 186 default: 187 assert(!"not reached"); 188 return 0; 189 } 190} 191 192/** 193 * Returns how many MRFs an FS opcode will write over. 194 * 195 * Note that this is not the 0 or 1 implied writes in an actual gen 196 * instruction -- the FS opcodes often generate MOVs in addition. 197 */ 198int 199fs_visitor::implied_mrf_writes(fs_inst *inst) 200{ 201 if (inst->mlen == 0) 202 return 0; 203 204 switch (inst->opcode) { 205 case FS_OPCODE_RCP: 206 case FS_OPCODE_RSQ: 207 case FS_OPCODE_SQRT: 208 case FS_OPCODE_EXP2: 209 case FS_OPCODE_LOG2: 210 case FS_OPCODE_SIN: 211 case FS_OPCODE_COS: 212 return 1; 213 case FS_OPCODE_POW: 214 return 2; 215 case FS_OPCODE_TEX: 216 case FS_OPCODE_TXB: 217 case FS_OPCODE_TXL: 218 return 1; 219 case FS_OPCODE_FB_WRITE: 220 return 2; 221 case FS_OPCODE_PULL_CONSTANT_LOAD: 222 case FS_OPCODE_UNSPILL: 223 return 1; 224 case FS_OPCODE_SPILL: 225 return 2; 226 default: 227 assert(!"not reached"); 228 return inst->mlen; 229 } 230} 231 232int 233fs_visitor::virtual_grf_alloc(int size) 234{ 235 if (virtual_grf_array_size <= virtual_grf_next) { 236 if (virtual_grf_array_size == 0) 237 virtual_grf_array_size = 16; 238 else 239 virtual_grf_array_size *= 2; 240 virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes, 241 int, virtual_grf_array_size); 242 243 /* This slot is always unused. */ 244 virtual_grf_sizes[0] = 0; 245 } 246 virtual_grf_sizes[virtual_grf_next] = size; 247 return virtual_grf_next++; 248} 249 250/** Fixed HW reg constructor. */ 251fs_reg::fs_reg(enum register_file file, int hw_reg) 252{ 253 init(); 254 this->file = file; 255 this->hw_reg = hw_reg; 256 this->type = BRW_REGISTER_TYPE_F; 257} 258 259/** Fixed HW reg constructor. 
*/ 260fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 261{ 262 init(); 263 this->file = file; 264 this->hw_reg = hw_reg; 265 this->type = type; 266} 267 268int 269brw_type_for_base_type(const struct glsl_type *type) 270{ 271 switch (type->base_type) { 272 case GLSL_TYPE_FLOAT: 273 return BRW_REGISTER_TYPE_F; 274 case GLSL_TYPE_INT: 275 case GLSL_TYPE_BOOL: 276 return BRW_REGISTER_TYPE_D; 277 case GLSL_TYPE_UINT: 278 return BRW_REGISTER_TYPE_UD; 279 case GLSL_TYPE_ARRAY: 280 case GLSL_TYPE_STRUCT: 281 case GLSL_TYPE_SAMPLER: 282 /* These should be overridden with the type of the member when 283 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely 284 * way to trip up if we don't. 285 */ 286 return BRW_REGISTER_TYPE_UD; 287 default: 288 assert(!"not reached"); 289 return BRW_REGISTER_TYPE_F; 290 } 291} 292 293/** Automatic reg constructor. */ 294fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 295{ 296 init(); 297 298 this->file = GRF; 299 this->reg = v->virtual_grf_alloc(type_size(type)); 300 this->reg_offset = 0; 301 this->type = brw_type_for_base_type(type); 302} 303 304fs_reg * 305fs_visitor::variable_storage(ir_variable *var) 306{ 307 return (fs_reg *)hash_table_find(this->variable_ht, var); 308} 309 310/* Our support for uniforms is piggy-backed on the struct 311 * gl_fragment_program, because that's where the values actually 312 * get stored, rather than in some global gl_shader_program uniform 313 * store. 
314 */ 315int 316fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 317{ 318 unsigned int offset = 0; 319 320 if (type->is_matrix()) { 321 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 322 type->vector_elements, 323 1); 324 325 for (unsigned int i = 0; i < type->matrix_columns; i++) { 326 offset += setup_uniform_values(loc + offset, column); 327 } 328 329 return offset; 330 } 331 332 switch (type->base_type) { 333 case GLSL_TYPE_FLOAT: 334 case GLSL_TYPE_UINT: 335 case GLSL_TYPE_INT: 336 case GLSL_TYPE_BOOL: 337 for (unsigned int i = 0; i < type->vector_elements; i++) { 338 unsigned int param = c->prog_data.nr_params++; 339 340 assert(param < ARRAY_SIZE(c->prog_data.param)); 341 342 switch (type->base_type) { 343 case GLSL_TYPE_FLOAT: 344 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 345 break; 346 case GLSL_TYPE_UINT: 347 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 348 break; 349 case GLSL_TYPE_INT: 350 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 351 break; 352 case GLSL_TYPE_BOOL: 353 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 354 break; 355 default: 356 assert(!"not reached"); 357 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 358 break; 359 } 360 this->param_index[param] = loc; 361 this->param_offset[param] = i; 362 } 363 return 1; 364 365 case GLSL_TYPE_STRUCT: 366 for (unsigned int i = 0; i < type->length; i++) { 367 offset += setup_uniform_values(loc + offset, 368 type->fields.structure[i].type); 369 } 370 return offset; 371 372 case GLSL_TYPE_ARRAY: 373 for (unsigned int i = 0; i < type->length; i++) { 374 offset += setup_uniform_values(loc + offset, type->fields.array); 375 } 376 return offset; 377 378 case GLSL_TYPE_SAMPLER: 379 /* The sampler takes up a slot, but we don't use any values from it. */ 380 return 1; 381 382 default: 383 assert(!"not reached"); 384 return 0; 385 } 386} 387 388 389/* Our support for builtin uniforms is even scarier than non-builtin. 
390 * It sits on top of the PROG_STATE_VAR parameters that are 391 * automatically updated from GL context state. 392 */ 393void 394fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 395{ 396 const struct gl_builtin_uniform_desc *statevar = NULL; 397 398 for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) { 399 statevar = &_mesa_builtin_uniform_desc[i]; 400 if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0) 401 break; 402 } 403 404 if (!statevar->name) { 405 this->fail = true; 406 printf("Failed to find builtin uniform `%s'\n", ir->name); 407 return; 408 } 409 410 int array_count; 411 if (ir->type->is_array()) { 412 array_count = ir->type->length; 413 } else { 414 array_count = 1; 415 } 416 417 for (int a = 0; a < array_count; a++) { 418 for (unsigned int i = 0; i < statevar->num_elements; i++) { 419 struct gl_builtin_uniform_element *element = &statevar->elements[i]; 420 int tokens[STATE_LENGTH]; 421 422 memcpy(tokens, element->tokens, sizeof(element->tokens)); 423 if (ir->type->is_array()) { 424 tokens[1] = a; 425 } 426 427 /* This state reference has already been setup by ir_to_mesa, 428 * but we'll get the same index back here. 429 */ 430 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 431 (gl_state_index *)tokens); 432 433 /* Add each of the unique swizzles of the element as a 434 * parameter. This'll end up matching the expected layout of 435 * the array/matrix/structure we're trying to fill in. 
436 */ 437 int last_swiz = -1; 438 for (unsigned int i = 0; i < 4; i++) { 439 int swiz = GET_SWZ(element->swizzle, i); 440 if (swiz == last_swiz) 441 break; 442 last_swiz = swiz; 443 444 c->prog_data.param_convert[c->prog_data.nr_params] = 445 PARAM_NO_CONVERT; 446 this->param_index[c->prog_data.nr_params] = index; 447 this->param_offset[c->prog_data.nr_params] = swiz; 448 c->prog_data.nr_params++; 449 } 450 } 451 } 452} 453 454fs_reg * 455fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 456{ 457 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 458 fs_reg wpos = *reg; 459 fs_reg neg_y = this->pixel_y; 460 neg_y.negate = true; 461 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 462 463 /* gl_FragCoord.x */ 464 if (ir->pixel_center_integer) { 465 emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x)); 466 } else { 467 emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f))); 468 } 469 wpos.reg_offset++; 470 471 /* gl_FragCoord.y */ 472 if (!flip && ir->pixel_center_integer) { 473 emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y)); 474 } else { 475 fs_reg pixel_y = this->pixel_y; 476 float offset = (ir->pixel_center_integer ? 
0.0 : 0.5); 477 478 if (flip) { 479 pixel_y.negate = true; 480 offset += c->key.drawable_height - 1.0; 481 } 482 483 emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset))); 484 } 485 wpos.reg_offset++; 486 487 /* gl_FragCoord.z */ 488 if (intel->gen >= 6) { 489 emit(fs_inst(BRW_OPCODE_MOV, wpos, 490 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)))); 491 } else { 492 emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 493 interp_reg(FRAG_ATTRIB_WPOS, 2))); 494 } 495 wpos.reg_offset++; 496 497 /* gl_FragCoord.w: Already set up in emit_interpolation */ 498 emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w)); 499 500 return reg; 501} 502 503fs_reg * 504fs_visitor::emit_general_interpolation(ir_variable *ir) 505{ 506 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 507 /* Interpolation is always in floating point regs. */ 508 reg->type = BRW_REGISTER_TYPE_F; 509 fs_reg attr = *reg; 510 511 unsigned int array_elements; 512 const glsl_type *type; 513 514 if (ir->type->is_array()) { 515 array_elements = ir->type->length; 516 if (array_elements == 0) { 517 this->fail = true; 518 } 519 type = ir->type->fields.array; 520 } else { 521 array_elements = 1; 522 type = ir->type; 523 } 524 525 int location = ir->location; 526 for (unsigned int i = 0; i < array_elements; i++) { 527 for (unsigned int j = 0; j < type->matrix_columns; j++) { 528 if (urb_setup[location] == -1) { 529 /* If there's no incoming setup data for this slot, don't 530 * emit interpolation for it. 531 */ 532 attr.reg_offset += type->vector_elements; 533 location++; 534 continue; 535 } 536 537 if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 || 538 location == FRAG_ATTRIB_COL1)) { 539 /* Constant interpolation (flat shading) case. The SF has 540 * handed us defined values in only the constant offset 541 * field of the setup reg. 
542 */ 543 for (unsigned int c = 0; c < type->vector_elements; c++) { 544 struct brw_reg interp = interp_reg(location, c); 545 interp = suboffset(interp, 3); 546 emit(fs_inst(FS_OPCODE_CINTERP, attr, fs_reg(interp))); 547 attr.reg_offset++; 548 } 549 } else { 550 /* Perspective interpolation case. */ 551 for (unsigned int c = 0; c < type->vector_elements; c++) { 552 struct brw_reg interp = interp_reg(location, c); 553 emit(fs_inst(FS_OPCODE_LINTERP, 554 attr, 555 this->delta_x, 556 this->delta_y, 557 fs_reg(interp))); 558 attr.reg_offset++; 559 } 560 561 if (intel->gen < 6) { 562 attr.reg_offset -= type->vector_elements; 563 for (unsigned int c = 0; c < type->vector_elements; c++) { 564 emit(fs_inst(BRW_OPCODE_MUL, 565 attr, 566 attr, 567 this->pixel_w)); 568 attr.reg_offset++; 569 } 570 } 571 } 572 location++; 573 } 574 } 575 576 return reg; 577} 578 579fs_reg * 580fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 581{ 582 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 583 584 /* The frontfacing comes in as a bit in the thread payload. 
*/ 585 if (intel->gen >= 6) { 586 emit(fs_inst(BRW_OPCODE_ASR, 587 *reg, 588 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 589 fs_reg(15))); 590 emit(fs_inst(BRW_OPCODE_NOT, 591 *reg, 592 *reg)); 593 emit(fs_inst(BRW_OPCODE_AND, 594 *reg, 595 *reg, 596 fs_reg(1))); 597 } else { 598 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 599 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 600 * us front face 601 */ 602 fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, 603 *reg, 604 fs_reg(r1_6ud), 605 fs_reg(1u << 31))); 606 inst->conditional_mod = BRW_CONDITIONAL_L; 607 emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u))); 608 } 609 610 return reg; 611} 612 613fs_inst * 614fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 615{ 616 switch (opcode) { 617 case FS_OPCODE_RCP: 618 case FS_OPCODE_RSQ: 619 case FS_OPCODE_SQRT: 620 case FS_OPCODE_EXP2: 621 case FS_OPCODE_LOG2: 622 case FS_OPCODE_SIN: 623 case FS_OPCODE_COS: 624 break; 625 default: 626 assert(!"not reached: bad math opcode"); 627 return NULL; 628 } 629 630 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 631 * might be able to do better by doing execsize = 1 math and then 632 * expanding that result out, but we would need to be careful with 633 * masking. 634 * 635 * The hardware ignores source modifiers (negate and abs) on math 636 * instructions, so we also move to a temp to set those up. 
637 */ 638 if (intel->gen >= 6 && (src.file == UNIFORM || 639 src.abs || 640 src.negate)) { 641 fs_reg expanded = fs_reg(this, glsl_type::float_type); 642 emit(fs_inst(BRW_OPCODE_MOV, expanded, src)); 643 src = expanded; 644 } 645 646 fs_inst *inst = emit(fs_inst(opcode, dst, src)); 647 648 if (intel->gen < 6) { 649 inst->base_mrf = 2; 650 inst->mlen = 1; 651 } 652 653 return inst; 654} 655 656fs_inst * 657fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 658{ 659 int base_mrf = 2; 660 fs_inst *inst; 661 662 assert(opcode == FS_OPCODE_POW); 663 664 if (intel->gen >= 6) { 665 /* Can't do hstride == 0 args to gen6 math, so expand it out. */ 666 if (src0.file == UNIFORM) { 667 fs_reg expanded = fs_reg(this, glsl_type::float_type); 668 emit(fs_inst(BRW_OPCODE_MOV, expanded, src0)); 669 src0 = expanded; 670 } 671 672 if (src1.file == UNIFORM) { 673 fs_reg expanded = fs_reg(this, glsl_type::float_type); 674 emit(fs_inst(BRW_OPCODE_MOV, expanded, src1)); 675 src1 = expanded; 676 } 677 678 inst = emit(fs_inst(opcode, dst, src0, src1)); 679 } else { 680 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1)); 681 inst = emit(fs_inst(opcode, dst, src0, reg_null_f)); 682 683 inst->base_mrf = base_mrf; 684 inst->mlen = 2; 685 } 686 return inst; 687} 688 689void 690fs_visitor::visit(ir_variable *ir) 691{ 692 fs_reg *reg = NULL; 693 694 if (variable_storage(ir)) 695 return; 696 697 if (strcmp(ir->name, "gl_FragColor") == 0) { 698 this->frag_color = ir; 699 } else if (strcmp(ir->name, "gl_FragData") == 0) { 700 this->frag_data = ir; 701 } else if (strcmp(ir->name, "gl_FragDepth") == 0) { 702 this->frag_depth = ir; 703 } 704 705 if (ir->mode == ir_var_in) { 706 if (!strcmp(ir->name, "gl_FragCoord")) { 707 reg = emit_fragcoord_interpolation(ir); 708 } else if (!strcmp(ir->name, "gl_FrontFacing")) { 709 reg = emit_frontfacing_interpolation(ir); 710 } else { 711 reg = emit_general_interpolation(ir); 712 } 713 assert(reg); 714 
hash_table_insert(this->variable_ht, reg, ir); 715 return; 716 } 717 718 if (ir->mode == ir_var_uniform) { 719 int param_index = c->prog_data.nr_params; 720 721 if (!strncmp(ir->name, "gl_", 3)) { 722 setup_builtin_uniform_values(ir); 723 } else { 724 setup_uniform_values(ir->location, ir->type); 725 } 726 727 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); 728 reg->type = brw_type_for_base_type(ir->type); 729 } 730 731 if (!reg) 732 reg = new(this->mem_ctx) fs_reg(this, ir->type); 733 734 hash_table_insert(this->variable_ht, reg, ir); 735} 736 737void 738fs_visitor::visit(ir_dereference_variable *ir) 739{ 740 fs_reg *reg = variable_storage(ir->var); 741 this->result = *reg; 742} 743 744void 745fs_visitor::visit(ir_dereference_record *ir) 746{ 747 const glsl_type *struct_type = ir->record->type; 748 749 ir->record->accept(this); 750 751 unsigned int offset = 0; 752 for (unsigned int i = 0; i < struct_type->length; i++) { 753 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) 754 break; 755 offset += type_size(struct_type->fields.structure[i].type); 756 } 757 this->result.reg_offset += offset; 758 this->result.type = brw_type_for_base_type(ir->type); 759} 760 761void 762fs_visitor::visit(ir_dereference_array *ir) 763{ 764 ir_constant *index; 765 int element_size; 766 767 ir->array->accept(this); 768 index = ir->array_index->as_constant(); 769 770 element_size = type_size(ir->type); 771 this->result.type = brw_type_for_base_type(ir->type); 772 773 if (index) { 774 assert(this->result.file == UNIFORM || 775 (this->result.file == GRF && 776 this->result.reg != 0)); 777 this->result.reg_offset += index->value.i[0] * element_size; 778 } else { 779 assert(!"FINISHME: non-constant array element"); 780 } 781} 782 783/* Instruction selection: Produce a MOV.sat instead of 784 * MIN(MAX(val, 0), 1) when possible. 
785 */ 786bool 787fs_visitor::try_emit_saturate(ir_expression *ir) 788{ 789 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 790 791 if (!sat_val) 792 return false; 793 794 sat_val->accept(this); 795 fs_reg src = this->result; 796 797 this->result = fs_reg(this, ir->type); 798 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, src)); 799 inst->saturate = true; 800 801 return true; 802} 803 804static uint32_t 805brw_conditional_for_comparison(unsigned int op) 806{ 807 switch (op) { 808 case ir_binop_less: 809 return BRW_CONDITIONAL_L; 810 case ir_binop_greater: 811 return BRW_CONDITIONAL_G; 812 case ir_binop_lequal: 813 return BRW_CONDITIONAL_LE; 814 case ir_binop_gequal: 815 return BRW_CONDITIONAL_GE; 816 case ir_binop_equal: 817 case ir_binop_all_equal: /* same as equal for scalars */ 818 return BRW_CONDITIONAL_Z; 819 case ir_binop_nequal: 820 case ir_binop_any_nequal: /* same as nequal for scalars */ 821 return BRW_CONDITIONAL_NZ; 822 default: 823 assert(!"not reached: bad operation for comparison"); 824 return BRW_CONDITIONAL_NZ; 825 } 826} 827 828void 829fs_visitor::visit(ir_expression *ir) 830{ 831 unsigned int operand; 832 fs_reg op[2], temp; 833 fs_inst *inst; 834 835 assert(ir->get_num_operands() <= 2); 836 837 if (try_emit_saturate(ir)) 838 return; 839 840 for (operand = 0; operand < ir->get_num_operands(); operand++) { 841 ir->operands[operand]->accept(this); 842 if (this->result.file == BAD_FILE) { 843 ir_print_visitor v; 844 printf("Failed to get tree for expression operand:\n"); 845 ir->operands[operand]->accept(&v); 846 this->fail = true; 847 } 848 op[operand] = this->result; 849 850 /* Matrix expression operands should have been broken down to vector 851 * operations already. 852 */ 853 assert(!ir->operands[operand]->type->is_matrix()); 854 /* And then those vector operands should have been broken down to scalar. 855 */ 856 assert(!ir->operands[operand]->type->is_vector()); 857 } 858 859 /* Storage for our result. 
If our result goes into an assignment, it will 860 * just get copy-propagated out, so no worries. 861 */ 862 this->result = fs_reg(this, ir->type); 863 864 switch (ir->operation) { 865 case ir_unop_logic_not: 866 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 867 * ones complement of the whole register, not just bit 0. 868 */ 869 emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1))); 870 break; 871 case ir_unop_neg: 872 op[0].negate = !op[0].negate; 873 this->result = op[0]; 874 break; 875 case ir_unop_abs: 876 op[0].abs = true; 877 op[0].negate = false; 878 this->result = op[0]; 879 break; 880 case ir_unop_sign: 881 temp = fs_reg(this, ir->type); 882 883 emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f))); 884 885 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f))); 886 inst->conditional_mod = BRW_CONDITIONAL_G; 887 inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f))); 888 inst->predicated = true; 889 890 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f))); 891 inst->conditional_mod = BRW_CONDITIONAL_L; 892 inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f))); 893 inst->predicated = true; 894 895 break; 896 case ir_unop_rcp: 897 emit_math(FS_OPCODE_RCP, this->result, op[0]); 898 break; 899 900 case ir_unop_exp2: 901 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 902 break; 903 case ir_unop_log2: 904 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 905 break; 906 case ir_unop_exp: 907 case ir_unop_log: 908 assert(!"not reached: should be handled by ir_explog_to_explog2"); 909 break; 910 case ir_unop_sin: 911 case ir_unop_sin_reduced: 912 emit_math(FS_OPCODE_SIN, this->result, op[0]); 913 break; 914 case ir_unop_cos: 915 case ir_unop_cos_reduced: 916 emit_math(FS_OPCODE_COS, this->result, op[0]); 917 break; 918 919 case ir_unop_dFdx: 920 emit(fs_inst(FS_OPCODE_DDX, this->result, op[0])); 921 break; 922 case ir_unop_dFdy: 923 emit(fs_inst(FS_OPCODE_DDY, this->result, op[0])); 
924 break; 925 926 case ir_binop_add: 927 emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1])); 928 break; 929 case ir_binop_sub: 930 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 931 break; 932 933 case ir_binop_mul: 934 emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1])); 935 break; 936 case ir_binop_div: 937 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 938 break; 939 case ir_binop_mod: 940 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 941 break; 942 943 case ir_binop_less: 944 case ir_binop_greater: 945 case ir_binop_lequal: 946 case ir_binop_gequal: 947 case ir_binop_equal: 948 case ir_binop_all_equal: 949 case ir_binop_nequal: 950 case ir_binop_any_nequal: 951 temp = this->result; 952 /* original gen4 does implicit conversion before comparison. */ 953 if (intel->gen < 5) 954 temp.type = op[0].type; 955 956 inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1])); 957 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 958 emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); 959 break; 960 961 case ir_binop_logic_xor: 962 emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1])); 963 break; 964 965 case ir_binop_logic_or: 966 emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1])); 967 break; 968 969 case ir_binop_logic_and: 970 emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1])); 971 break; 972 973 case ir_binop_dot: 974 case ir_unop_any: 975 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 976 break; 977 978 case ir_unop_noise: 979 assert(!"not reached: should be handled by lower_noise"); 980 break; 981 982 case ir_quadop_vector: 983 assert(!"not reached: should be handled by lower_quadop_vector"); 984 break; 985 986 case ir_unop_sqrt: 987 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 988 break; 989 990 case ir_unop_rsq: 991 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 992 break; 993 994 case ir_unop_i2f: 995 case 
ir_unop_b2f: 996 case ir_unop_b2i: 997 case ir_unop_f2i: 998 emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0])); 999 break; 1000 case ir_unop_f2b: 1001 case ir_unop_i2b: 1002 temp = this->result; 1003 /* original gen4 does implicit conversion before comparison. */ 1004 if (intel->gen < 5) 1005 temp.type = op[0].type; 1006 1007 inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f))); 1008 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1009 inst = emit(fs_inst(BRW_OPCODE_AND, this->result, 1010 this->result, fs_reg(1))); 1011 break; 1012 1013 case ir_unop_trunc: 1014 emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0])); 1015 break; 1016 case ir_unop_ceil: 1017 op[0].negate = !op[0].negate; 1018 inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0])); 1019 this->result.negate = true; 1020 break; 1021 case ir_unop_floor: 1022 inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0])); 1023 break; 1024 case ir_unop_fract: 1025 inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0])); 1026 break; 1027 case ir_unop_round_even: 1028 emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0])); 1029 break; 1030 1031 case ir_binop_min: 1032 inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); 1033 inst->conditional_mod = BRW_CONDITIONAL_L; 1034 1035 inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1])); 1036 inst->predicated = true; 1037 break; 1038 case ir_binop_max: 1039 inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); 1040 inst->conditional_mod = BRW_CONDITIONAL_G; 1041 1042 inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1])); 1043 inst->predicated = true; 1044 break; 1045 1046 case ir_binop_pow: 1047 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1048 break; 1049 1050 case ir_unop_bit_not: 1051 inst = emit(fs_inst(BRW_OPCODE_NOT, this->result, op[0])); 1052 break; 1053 case ir_binop_bit_and: 1054 inst = emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1])); 1055 break; 1056 case ir_binop_bit_xor: 1057 inst = 
emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1])); 1058 break; 1059 case ir_binop_bit_or: 1060 inst = emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1])); 1061 break; 1062 1063 case ir_unop_u2f: 1064 case ir_binop_lshift: 1065 case ir_binop_rshift: 1066 assert(!"GLSL 1.30 features unsupported"); 1067 break; 1068 } 1069} 1070 1071void 1072fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1073 const glsl_type *type, bool predicated) 1074{ 1075 switch (type->base_type) { 1076 case GLSL_TYPE_FLOAT: 1077 case GLSL_TYPE_UINT: 1078 case GLSL_TYPE_INT: 1079 case GLSL_TYPE_BOOL: 1080 for (unsigned int i = 0; i < type->components(); i++) { 1081 l.type = brw_type_for_base_type(type); 1082 r.type = brw_type_for_base_type(type); 1083 1084 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r)); 1085 inst->predicated = predicated; 1086 1087 l.reg_offset++; 1088 r.reg_offset++; 1089 } 1090 break; 1091 case GLSL_TYPE_ARRAY: 1092 for (unsigned int i = 0; i < type->length; i++) { 1093 emit_assignment_writes(l, r, type->fields.array, predicated); 1094 } 1095 break; 1096 1097 case GLSL_TYPE_STRUCT: 1098 for (unsigned int i = 0; i < type->length; i++) { 1099 emit_assignment_writes(l, r, type->fields.structure[i].type, 1100 predicated); 1101 } 1102 break; 1103 1104 case GLSL_TYPE_SAMPLER: 1105 break; 1106 1107 default: 1108 assert(!"not reached"); 1109 break; 1110 } 1111} 1112 1113void 1114fs_visitor::visit(ir_assignment *ir) 1115{ 1116 struct fs_reg l, r; 1117 fs_inst *inst; 1118 1119 /* FINISHME: arrays on the lhs */ 1120 ir->lhs->accept(this); 1121 l = this->result; 1122 1123 ir->rhs->accept(this); 1124 r = this->result; 1125 1126 assert(l.file != BAD_FILE); 1127 assert(r.file != BAD_FILE); 1128 1129 if (ir->condition) { 1130 emit_bool_to_cond_code(ir->condition); 1131 } 1132 1133 if (ir->lhs->type->is_scalar() || 1134 ir->lhs->type->is_vector()) { 1135 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1136 if (ir->write_mask & (1 << i)) { 1137 inst = 
emit(fs_inst(BRW_OPCODE_MOV, l, r)); 1138 if (ir->condition) 1139 inst->predicated = true; 1140 r.reg_offset++; 1141 } 1142 l.reg_offset++; 1143 } 1144 } else { 1145 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1146 } 1147} 1148 1149fs_inst * 1150fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1151{ 1152 int mlen; 1153 int base_mrf = 1; 1154 bool simd16 = false; 1155 fs_reg orig_dst; 1156 1157 /* g0 header. */ 1158 mlen = 1; 1159 1160 if (ir->shadow_comparitor) { 1161 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1162 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), 1163 coordinate)); 1164 coordinate.reg_offset++; 1165 } 1166 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1167 mlen += 3; 1168 1169 if (ir->op == ir_tex) { 1170 /* There's no plain shadow compare message, so we use shadow 1171 * compare with a bias of 0.0. 1172 */ 1173 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1174 fs_reg(0.0f))); 1175 mlen++; 1176 } else if (ir->op == ir_txb) { 1177 ir->lod_info.bias->accept(this); 1178 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1179 this->result)); 1180 mlen++; 1181 } else { 1182 assert(ir->op == ir_txl); 1183 ir->lod_info.lod->accept(this); 1184 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1185 this->result)); 1186 mlen++; 1187 } 1188 1189 ir->shadow_comparitor->accept(this); 1190 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1191 mlen++; 1192 } else if (ir->op == ir_tex) { 1193 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1194 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), 1195 coordinate)); 1196 coordinate.reg_offset++; 1197 } 1198 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1199 mlen += 3; 1200 } else { 1201 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1202 * instructions. We'll need to do SIMD16 here. 
1203 */ 1204 assert(ir->op == ir_txb || ir->op == ir_txl); 1205 1206 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1207 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), 1208 coordinate)); 1209 coordinate.reg_offset++; 1210 } 1211 1212 /* lod/bias appears after u/v/r. */ 1213 mlen += 6; 1214 1215 if (ir->op == ir_txb) { 1216 ir->lod_info.bias->accept(this); 1217 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1218 this->result)); 1219 mlen++; 1220 } else { 1221 ir->lod_info.lod->accept(this); 1222 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1223 this->result)); 1224 mlen++; 1225 } 1226 1227 /* The unused upper half. */ 1228 mlen++; 1229 1230 /* Now, since we're doing simd16, the return is 2 interleaved 1231 * vec4s where the odd-indexed ones are junk. We'll need to move 1232 * this weirdness around to the expected layout. 1233 */ 1234 simd16 = true; 1235 orig_dst = dst; 1236 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1237 2)); 1238 dst.type = BRW_REGISTER_TYPE_F; 1239 } 1240 1241 fs_inst *inst = NULL; 1242 switch (ir->op) { 1243 case ir_tex: 1244 inst = emit(fs_inst(FS_OPCODE_TEX, dst)); 1245 break; 1246 case ir_txb: 1247 inst = emit(fs_inst(FS_OPCODE_TXB, dst)); 1248 break; 1249 case ir_txl: 1250 inst = emit(fs_inst(FS_OPCODE_TXL, dst)); 1251 break; 1252 case ir_txd: 1253 case ir_txf: 1254 assert(!"GLSL 1.30 features unsupported"); 1255 break; 1256 } 1257 inst->base_mrf = base_mrf; 1258 inst->mlen = mlen; 1259 1260 if (simd16) { 1261 for (int i = 0; i < 4; i++) { 1262 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst)); 1263 orig_dst.reg_offset++; 1264 dst.reg_offset += 2; 1265 } 1266 } 1267 1268 return inst; 1269} 1270 1271fs_inst * 1272fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1273{ 1274 /* gen5's SIMD8 sampler has slots for u, v, r, array index, then 1275 * optional parameters like shadow comparitor or LOD bias. 
If 1276 * optional parameters aren't present, those base slots are 1277 * optional and don't need to be included in the message. 1278 * 1279 * We don't fill in the unnecessary slots regardless, which may 1280 * look surprising in the disassembly. 1281 */ 1282 int mlen = 1; /* g0 header always present. */ 1283 int base_mrf = 1; 1284 1285 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1286 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), 1287 coordinate)); 1288 coordinate.reg_offset++; 1289 } 1290 mlen += ir->coordinate->type->vector_elements; 1291 1292 if (ir->shadow_comparitor) { 1293 mlen = MAX2(mlen, 5); 1294 1295 ir->shadow_comparitor->accept(this); 1296 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1297 mlen++; 1298 } 1299 1300 fs_inst *inst = NULL; 1301 switch (ir->op) { 1302 case ir_tex: 1303 inst = emit(fs_inst(FS_OPCODE_TEX, dst)); 1304 break; 1305 case ir_txb: 1306 ir->lod_info.bias->accept(this); 1307 mlen = MAX2(mlen, 5); 1308 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1309 mlen++; 1310 1311 inst = emit(fs_inst(FS_OPCODE_TXB, dst)); 1312 break; 1313 case ir_txl: 1314 ir->lod_info.lod->accept(this); 1315 mlen = MAX2(mlen, 5); 1316 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1317 mlen++; 1318 1319 inst = emit(fs_inst(FS_OPCODE_TXL, dst)); 1320 break; 1321 case ir_txd: 1322 case ir_txf: 1323 assert(!"GLSL 1.30 features unsupported"); 1324 break; 1325 } 1326 inst->base_mrf = base_mrf; 1327 inst->mlen = mlen; 1328 1329 return inst; 1330} 1331 1332void 1333fs_visitor::visit(ir_texture *ir) 1334{ 1335 int sampler; 1336 fs_inst *inst = NULL; 1337 1338 ir->coordinate->accept(this); 1339 fs_reg coordinate = this->result; 1340 1341 /* Should be lowered by do_lower_texture_projection */ 1342 assert(!ir->projector); 1343 1344 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1345 ctx->Shader.CurrentFragmentProgram, 1346 
&brw->fragment_program->Base); 1347 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1348 1349 /* The 965 requires the EU to do the normalization of GL rectangle 1350 * texture coordinates. We use the program parameter state 1351 * tracking to get the scaling factor. 1352 */ 1353 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1354 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1355 int tokens[STATE_LENGTH] = { 1356 STATE_INTERNAL, 1357 STATE_TEXRECT_SCALE, 1358 sampler, 1359 0, 1360 0 1361 }; 1362 1363 c->prog_data.param_convert[c->prog_data.nr_params] = 1364 PARAM_NO_CONVERT; 1365 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1366 PARAM_NO_CONVERT; 1367 1368 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1369 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1370 GLuint index = _mesa_add_state_reference(params, 1371 (gl_state_index *)tokens); 1372 1373 this->param_index[c->prog_data.nr_params] = index; 1374 this->param_offset[c->prog_data.nr_params] = 0; 1375 c->prog_data.nr_params++; 1376 this->param_index[c->prog_data.nr_params] = index; 1377 this->param_offset[c->prog_data.nr_params] = 1; 1378 c->prog_data.nr_params++; 1379 1380 fs_reg dst = fs_reg(this, ir->coordinate->type); 1381 fs_reg src = coordinate; 1382 coordinate = dst; 1383 1384 emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x)); 1385 dst.reg_offset++; 1386 src.reg_offset++; 1387 emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y)); 1388 } 1389 1390 /* Writemasking doesn't eliminate channels on SIMD8 texture 1391 * samples, so don't worry about them. 
1392 */ 1393 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1394 1395 if (intel->gen < 5) { 1396 inst = emit_texture_gen4(ir, dst, coordinate); 1397 } else { 1398 inst = emit_texture_gen5(ir, dst, coordinate); 1399 } 1400 1401 inst->sampler = sampler; 1402 1403 this->result = dst; 1404 1405 if (ir->shadow_comparitor) 1406 inst->shadow_compare = true; 1407 1408 if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1409 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1410 1411 for (int i = 0; i < 4; i++) { 1412 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1413 fs_reg l = swizzle_dst; 1414 l.reg_offset += i; 1415 1416 if (swiz == SWIZZLE_ZERO) { 1417 emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f))); 1418 } else if (swiz == SWIZZLE_ONE) { 1419 emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f))); 1420 } else { 1421 fs_reg r = dst; 1422 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1423 emit(fs_inst(BRW_OPCODE_MOV, l, r)); 1424 } 1425 } 1426 this->result = swizzle_dst; 1427 } 1428} 1429
/**
 * Evaluates a swizzle expression.
 *
 * A single-component swizzle emits no instructions: the result is simply
 * the operand's register offset by the selected component.  Wider
 * swizzles get a fresh register of the swizzle's type and one MOV per
 * output component; this->result is left pointing at the first component.
 */
void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   const fs_reg source = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg dst = fs_reg(this, ir->type);
   this->result = dst;

   for (unsigned int chan = 0; chan < ir->type->vector_elements; chan++) {
      /* Which component of the operand feeds output component chan. */
      int component = 0;

      if (chan == 0)
         component = ir->mask.x;
      else if (chan == 1)
         component = ir->mask.y;
      else if (chan == 2)
         component = ir->mask.z;
      else if (chan == 3)
         component = ir->mask.w;

      fs_reg src = source;
      src.reg_offset += component;

      emit(fs_inst(BRW_OPCODE_MOV, dst, src));
      dst.reg_offset++;
   }
}
1468 1469void 1470fs_visitor::visit(ir_discard *ir) 1471{ 1472 fs_reg temp = fs_reg(this, glsl_type::uint_type); 1473 1474 assert(ir->condition == NULL); /* FINISHME */ 1475 1476 
emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d)); 1477 emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp)); 1478 kill_emitted = true; 1479} 1480 1481void 1482fs_visitor::visit(ir_constant *ir) 1483{ 1484 /* Set this->result to reg at the bottom of the function because some code 1485 * paths will cause this visitor to be applied to other fields. This will 1486 * cause the value stored in this->result to be modified. 1487 * 1488 * Make reg constant so that it doesn't get accidentally modified along the 1489 * way. Yes, I actually had this problem. :( 1490 */ 1491 const fs_reg reg(this, ir->type); 1492 fs_reg dst_reg = reg; 1493 1494 if (ir->type->is_array()) { 1495 const unsigned size = type_size(ir->type->fields.array); 1496 1497 for (unsigned i = 0; i < ir->type->length; i++) { 1498 ir->array_elements[i]->accept(this); 1499 fs_reg src_reg = this->result; 1500 1501 dst_reg.type = src_reg.type; 1502 for (unsigned j = 0; j < size; j++) { 1503 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg)); 1504 src_reg.reg_offset++; 1505 dst_reg.reg_offset++; 1506 } 1507 } 1508 } else if (ir->type->is_record()) { 1509 foreach_list(node, &ir->components) { 1510 ir_instruction *const field = (ir_instruction *) node; 1511 const unsigned size = type_size(field->type); 1512 1513 field->accept(this); 1514 fs_reg src_reg = this->result; 1515 1516 dst_reg.type = src_reg.type; 1517 for (unsigned j = 0; j < size; j++) { 1518 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg)); 1519 src_reg.reg_offset++; 1520 dst_reg.reg_offset++; 1521 } 1522 } 1523 } else { 1524 const unsigned size = type_size(ir->type); 1525 1526 for (unsigned i = 0; i < size; i++) { 1527 switch (ir->type->base_type) { 1528 case GLSL_TYPE_FLOAT: 1529 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]))); 1530 break; 1531 case GLSL_TYPE_UINT: 1532 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]))); 1533 break; 1534 case GLSL_TYPE_INT: 1535 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, 
fs_reg(ir->value.i[i]))); 1536 break; 1537 case GLSL_TYPE_BOOL: 1538 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]))); 1539 break; 1540 default: 1541 assert(!"Non-float/uint/int/bool constant"); 1542 } 1543 dst_reg.reg_offset++; 1544 } 1545 } 1546 1547 this->result = reg; 1548} 1549 1550void 1551fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1552{ 1553 ir_expression *expr = ir->as_expression(); 1554 1555 if (expr) { 1556 fs_reg op[2]; 1557 fs_inst *inst; 1558 1559 assert(expr->get_num_operands() <= 2); 1560 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1561 assert(expr->operands[i]->type->is_scalar()); 1562 1563 expr->operands[i]->accept(this); 1564 op[i] = this->result; 1565 } 1566 1567 switch (expr->operation) { 1568 case ir_unop_logic_not: 1569 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1))); 1570 inst->conditional_mod = BRW_CONDITIONAL_Z; 1571 break; 1572 1573 case ir_binop_logic_xor: 1574 inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1])); 1575 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1576 break; 1577 1578 case ir_binop_logic_or: 1579 inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1])); 1580 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1581 break; 1582 1583 case ir_binop_logic_and: 1584 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1])); 1585 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1586 break; 1587 1588 case ir_unop_f2b: 1589 if (intel->gen >= 6) { 1590 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, 1591 op[0], fs_reg(0.0f))); 1592 } else { 1593 inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0])); 1594 } 1595 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1596 break; 1597 1598 case ir_unop_i2b: 1599 if (intel->gen >= 6) { 1600 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0))); 1601 } else { 1602 inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0])); 1603 } 1604 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1605 break; 1606 1607 case 
ir_binop_greater: 1608 case ir_binop_gequal: 1609 case ir_binop_less: 1610 case ir_binop_lequal: 1611 case ir_binop_equal: 1612 case ir_binop_all_equal: 1613 case ir_binop_nequal: 1614 case ir_binop_any_nequal: 1615 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1])); 1616 inst->conditional_mod = 1617 brw_conditional_for_comparison(expr->operation); 1618 break; 1619 1620 default: 1621 assert(!"not reached"); 1622 this->fail = true; 1623 break; 1624 } 1625 return; 1626 } 1627 1628 ir->accept(this); 1629 1630 if (intel->gen >= 6) { 1631 fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, 1632 this->result, fs_reg(1))); 1633 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1634 } else { 1635 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result)); 1636 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1637 } 1638} 1639 1640/** 1641 * Emit a gen6 IF statement with the comparison folded into the IF 1642 * instruction. 1643 */ 1644void 1645fs_visitor::emit_if_gen6(ir_if *ir) 1646{ 1647 ir_expression *expr = ir->condition->as_expression(); 1648 1649 if (expr) { 1650 fs_reg op[2]; 1651 fs_inst *inst; 1652 fs_reg temp; 1653 1654 assert(expr->get_num_operands() <= 2); 1655 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1656 assert(expr->operands[i]->type->is_scalar()); 1657 1658 expr->operands[i]->accept(this); 1659 op[i] = this->result; 1660 } 1661 1662 switch (expr->operation) { 1663 case ir_unop_logic_not: 1664 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(0))); 1665 inst->conditional_mod = BRW_CONDITIONAL_Z; 1666 return; 1667 1668 case ir_binop_logic_xor: 1669 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); 1670 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1671 return; 1672 1673 case ir_binop_logic_or: 1674 temp = fs_reg(this, glsl_type::bool_type); 1675 emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1])); 1676 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0))); 1677 inst->conditional_mod = 
BRW_CONDITIONAL_NZ; 1678 return; 1679 1680 case ir_binop_logic_and: 1681 temp = fs_reg(this, glsl_type::bool_type); 1682 emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1])); 1683 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0))); 1684 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1685 return; 1686 1687 case ir_unop_f2b: 1688 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0))); 1689 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1690 return; 1691 1692 case ir_unop_i2b: 1693 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0))); 1694 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1695 return; 1696 1697 case ir_binop_greater: 1698 case ir_binop_gequal: 1699 case ir_binop_less: 1700 case ir_binop_lequal: 1701 case ir_binop_equal: 1702 case ir_binop_all_equal: 1703 case ir_binop_nequal: 1704 case ir_binop_any_nequal: 1705 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); 1706 inst->conditional_mod = 1707 brw_conditional_for_comparison(expr->operation); 1708 return; 1709 default: 1710 assert(!"not reached"); 1711 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0))); 1712 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1713 this->fail = true; 1714 return; 1715 } 1716 return; 1717 } 1718 1719 ir->condition->accept(this); 1720 1721 fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0))); 1722 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1723} 1724 1725void 1726fs_visitor::visit(ir_if *ir) 1727{ 1728 fs_inst *inst; 1729 1730 /* Don't point the annotation at the if statement, because then it plus 1731 * the then and else blocks get printed. 
1732 */ 1733 this->base_ir = ir->condition; 1734 1735 if (intel->gen >= 6) { 1736 emit_if_gen6(ir); 1737 } else { 1738 emit_bool_to_cond_code(ir->condition); 1739 1740 inst = emit(fs_inst(BRW_OPCODE_IF)); 1741 inst->predicated = true; 1742 } 1743 1744 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1745 ir_instruction *ir = (ir_instruction *)iter.get(); 1746 this->base_ir = ir; 1747 1748 ir->accept(this); 1749 } 1750 1751 if (!ir->else_instructions.is_empty()) { 1752 emit(fs_inst(BRW_OPCODE_ELSE)); 1753 1754 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1755 ir_instruction *ir = (ir_instruction *)iter.get(); 1756 this->base_ir = ir; 1757 1758 ir->accept(this); 1759 } 1760 } 1761 1762 emit(fs_inst(BRW_OPCODE_ENDIF)); 1763} 1764
/**
 * Emits a DO ... WHILE loop for an ir_loop.
 *
 * When the loop carries an explicit counter, its initialisation is emitted
 * before the DO, the bound test (a CMP followed by a predicated BREAK) at
 * the top of the body, and the increment just before the WHILE.
 */
void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
      }
   }

   emit(fs_inst(BRW_OPCODE_DO));

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      /* Compare the counter against the bound and break out of the loop
       * for the channels where the comparison holds.
       */
      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp,
                                   counter, this->result));
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(fs_inst(BRW_OPCODE_BREAK));
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *stmt = (ir_instruction *)iter.get();

      this->base_ir = stmt;
      stmt->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
   }

   emit(fs_inst(BRW_OPCODE_WHILE));
}

/** Emits the BREAK or CONTINUE instruction for a loop jump. */
void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(fs_inst(BRW_OPCODE_BREAK));
      break;
   case ir_loop_jump::jump_continue:
      emit(fs_inst(BRW_OPCODE_CONTINUE));
      break;
   }
}

/**
 * Function calls are not implemented.
 *
 * Besides asserting, flag the compile as failed so that a build with
 * asserts disabled does not silently drop the call and produce a shader
 * with missing code.
 */
void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
   (void)ir;
   this->fail = true;
}

/**
 * Return statements are not implemented.
 *
 * Flags the compile as failed for the same reason as visit(ir_call).
 */
void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
   (void)ir;
   this->fail = true;
}

/**
 * Visits the body of main(); every other function is ignored.
 */
void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get here.
    */
   if (strcmp(ir->name, "main") != 0)
      return;

   exec_list empty;
   const ir_function_signature *sig = ir->matching_signature(&empty);

   assert(sig);
   if (!sig) {
      /* No parameterless main(): nothing to compile.  Fail instead of
       * dereferencing NULL when asserts are compiled out.
       */
      this->fail = true;
      return;
   }

   foreach_iter(exec_list_iterator, iter, sig->body) {
      ir_instruction *stmt = (ir_instruction *)iter.get();
      this->base_ir = stmt;

      stmt->accept(this);
   }
}

/** Signatures are walked from visit(ir_function), never visited directly. */
void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

/**
 * Appends a copy of inst to the instruction list.
 *
 * The copy is allocated out of mem_ctx and tagged with the current
 * annotation string and the IR node being processed (this->base_ir).
 *
 * \return pointer to the copy stored in the list, so callers can set
 *         further fields on it.
 */
fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}
1881 1882/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 1883void 1884fs_visitor::emit_dummy_fs() 1885{ 1886 /* Everyone's favorite color. 
*/ 1887 emit(fs_inst(BRW_OPCODE_MOV, 1888 fs_reg(MRF, 2), 1889 fs_reg(1.0f))); 1890 emit(fs_inst(BRW_OPCODE_MOV, 1891 fs_reg(MRF, 3), 1892 fs_reg(0.0f))); 1893 emit(fs_inst(BRW_OPCODE_MOV, 1894 fs_reg(MRF, 4), 1895 fs_reg(1.0f))); 1896 emit(fs_inst(BRW_OPCODE_MOV, 1897 fs_reg(MRF, 5), 1898 fs_reg(0.0f))); 1899 1900 fs_inst *write; 1901 write = emit(fs_inst(FS_OPCODE_FB_WRITE, 1902 fs_reg(0), 1903 fs_reg(0))); 1904 write->base_mrf = 0; 1905} 1906 1907/* The register location here is relative to the start of the URB 1908 * data. It will get adjusted to be a real location before 1909 * generate_code() time. 1910 */ 1911struct brw_reg 1912fs_visitor::interp_reg(int location, int channel) 1913{ 1914 int regnr = urb_setup[location] * 2 + channel / 2; 1915 int stride = (channel & 1) * 4; 1916 1917 assert(urb_setup[location] != -1); 1918 1919 return brw_vec1_grf(regnr, stride); 1920} 1921 1922/** Emits the interpolation for the varying inputs. */ 1923void 1924fs_visitor::emit_interpolation_setup_gen4() 1925{ 1926 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 1927 1928 this->current_annotation = "compute pixel centers"; 1929 this->pixel_x = fs_reg(this, glsl_type::uint_type); 1930 this->pixel_y = fs_reg(this, glsl_type::uint_type); 1931 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 1932 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 1933 emit(fs_inst(BRW_OPCODE_ADD, 1934 this->pixel_x, 1935 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 1936 fs_reg(brw_imm_v(0x10101010)))); 1937 emit(fs_inst(BRW_OPCODE_ADD, 1938 this->pixel_y, 1939 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 1940 fs_reg(brw_imm_v(0x11001100)))); 1941 1942 this->current_annotation = "compute pixel deltas from v0"; 1943 if (brw->has_pln) { 1944 this->delta_x = fs_reg(this, glsl_type::vec2_type); 1945 this->delta_y = this->delta_x; 1946 this->delta_y.reg_offset++; 1947 } else { 1948 this->delta_x = fs_reg(this, glsl_type::float_type); 1949 this->delta_y = fs_reg(this, 
glsl_type::float_type); 1950 } 1951 emit(fs_inst(BRW_OPCODE_ADD, 1952 this->delta_x, 1953 this->pixel_x, 1954 fs_reg(negate(brw_vec1_grf(1, 0))))); 1955 emit(fs_inst(BRW_OPCODE_ADD, 1956 this->delta_y, 1957 this->pixel_y, 1958 fs_reg(negate(brw_vec1_grf(1, 1))))); 1959 1960 this->current_annotation = "compute pos.w and 1/pos.w"; 1961 /* Compute wpos.w. It's always in our setup, since it's needed to 1962 * interpolate the other attributes. 1963 */ 1964 this->wpos_w = fs_reg(this, glsl_type::float_type); 1965 emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y, 1966 interp_reg(FRAG_ATTRIB_WPOS, 3))); 1967 /* Compute the pixel 1/W value from wpos.w. */ 1968 this->pixel_w = fs_reg(this, glsl_type::float_type); 1969 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 1970 this->current_annotation = NULL; 1971} 1972 1973/** Emits the interpolation for the varying inputs. */ 1974void 1975fs_visitor::emit_interpolation_setup_gen6() 1976{ 1977 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 1978 1979 /* If the pixel centers end up used, the setup is the same as for gen4. */ 1980 this->current_annotation = "compute pixel centers"; 1981 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); 1982 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); 1983 int_pixel_x.type = BRW_REGISTER_TYPE_UW; 1984 int_pixel_y.type = BRW_REGISTER_TYPE_UW; 1985 emit(fs_inst(BRW_OPCODE_ADD, 1986 int_pixel_x, 1987 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 1988 fs_reg(brw_imm_v(0x10101010)))); 1989 emit(fs_inst(BRW_OPCODE_ADD, 1990 int_pixel_y, 1991 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 1992 fs_reg(brw_imm_v(0x11001100)))); 1993 1994 /* As of gen6, we can no longer mix float and int sources. We have 1995 * to turn the integer pixel centers into floats for their actual 1996 * use. 
1997 */ 1998 this->pixel_x = fs_reg(this, glsl_type::float_type); 1999 this->pixel_y = fs_reg(this, glsl_type::float_type); 2000 emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x)); 2001 emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y)); 2002 2003 this->current_annotation = "compute 1/pos.w"; 2004 this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); 2005 this->pixel_w = fs_reg(this, glsl_type::float_type); 2006 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 2007 2008 this->delta_x = fs_reg(brw_vec8_grf(2, 0)); 2009 this->delta_y = fs_reg(brw_vec8_grf(3, 0)); 2010 2011 this->current_annotation = NULL; 2012} 2013 2014void 2015fs_visitor::emit_fb_writes() 2016{ 2017 this->current_annotation = "FB write header"; 2018 GLboolean header_present = GL_TRUE; 2019 int nr = 0; 2020 2021 if (intel->gen >= 6 && 2022 !this->kill_emitted && 2023 c->key.nr_color_regions == 1) { 2024 header_present = false; 2025 } 2026 2027 if (header_present) { 2028 /* m0, m1 header */ 2029 nr += 2; 2030 } 2031 2032 if (c->aa_dest_stencil_reg) { 2033 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2034 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)))); 2035 } 2036 2037 /* Reserve space for color. It'll be filled in per MRT below. */ 2038 int color_mrf = nr; 2039 nr += 4; 2040 2041 if (c->source_depth_to_render_target) { 2042 if (c->computes_depth) { 2043 /* Hand over gl_FragDepth. */ 2044 assert(this->frag_depth); 2045 fs_reg depth = *(variable_storage(this->frag_depth)); 2046 2047 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth)); 2048 } else { 2049 /* Pass through the payload depth. 
*/ 2050 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2051 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)))); 2052 } 2053 } 2054 2055 if (c->dest_depth_reg) { 2056 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2057 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)))); 2058 } 2059 2060 fs_reg color = reg_undef; 2061 if (this->frag_color) 2062 color = *(variable_storage(this->frag_color)); 2063 else if (this->frag_data) { 2064 color = *(variable_storage(this->frag_data)); 2065 color.type = BRW_REGISTER_TYPE_F; 2066 } 2067 2068 for (int target = 0; target < c->key.nr_color_regions; target++) { 2069 this->current_annotation = talloc_asprintf(this->mem_ctx, 2070 "FB write target %d", 2071 target); 2072 if (this->frag_color || this->frag_data) { 2073 for (int i = 0; i < 4; i++) { 2074 emit(fs_inst(BRW_OPCODE_MOV, 2075 fs_reg(MRF, color_mrf + i), 2076 color)); 2077 color.reg_offset++; 2078 } 2079 } 2080 2081 if (this->frag_color) 2082 color.reg_offset -= 4; 2083 2084 fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE, 2085 reg_undef, reg_undef)); 2086 inst->target = target; 2087 inst->base_mrf = 0; 2088 inst->mlen = nr; 2089 if (target == c->key.nr_color_regions - 1) 2090 inst->eot = true; 2091 inst->header_present = header_present; 2092 } 2093 2094 if (c->key.nr_color_regions == 0) { 2095 fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE, 2096 reg_undef, reg_undef)); 2097 inst->base_mrf = 0; 2098 inst->mlen = nr; 2099 inst->eot = true; 2100 inst->header_present = header_present; 2101 } 2102 2103 this->current_annotation = NULL; 2104} 2105 2106void 2107fs_visitor::generate_fb_write(fs_inst *inst) 2108{ 2109 GLboolean eot = inst->eot; 2110 struct brw_reg implied_header; 2111 2112 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied 2113 * move, here's g1. 
2114 */ 2115 brw_push_insn_state(p); 2116 brw_set_mask_control(p, BRW_MASK_DISABLE); 2117 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2118 2119 if (inst->header_present) { 2120 if (intel->gen >= 6) { 2121 brw_MOV(p, 2122 brw_message_reg(inst->base_mrf), 2123 brw_vec8_grf(0, 0)); 2124 2125 if (inst->target > 0) { 2126 /* Set the render target index for choosing BLEND_STATE. */ 2127 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2128 BRW_REGISTER_TYPE_UD), 2129 brw_imm_ud(inst->target)); 2130 } 2131 2132 /* Clear viewport index, render target array index. */ 2133 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0), 2134 BRW_REGISTER_TYPE_UD), 2135 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2136 brw_imm_ud(0xf7ff)); 2137 2138 implied_header = brw_null_reg(); 2139 } else { 2140 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2141 } 2142 2143 brw_MOV(p, 2144 brw_message_reg(inst->base_mrf + 1), 2145 brw_vec8_grf(1, 0)); 2146 } else { 2147 implied_header = brw_null_reg(); 2148 } 2149 2150 brw_pop_insn_state(p); 2151 2152 brw_fb_WRITE(p, 2153 8, /* dispatch_width */ 2154 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), 2155 inst->base_mrf, 2156 implied_header, 2157 inst->target, 2158 inst->mlen, 2159 0, 2160 eot, 2161 inst->header_present); 2162} 2163 2164void 2165fs_visitor::generate_linterp(fs_inst *inst, 2166 struct brw_reg dst, struct brw_reg *src) 2167{ 2168 struct brw_reg delta_x = src[0]; 2169 struct brw_reg delta_y = src[1]; 2170 struct brw_reg interp = src[2]; 2171 2172 if (brw->has_pln && 2173 delta_y.nr == delta_x.nr + 1 && 2174 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2175 brw_PLN(p, dst, interp, delta_x); 2176 } else { 2177 brw_LINE(p, brw_null_reg(), interp, delta_x); 2178 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2179 } 2180} 2181 2182void 2183fs_visitor::generate_math(fs_inst *inst, 2184 struct brw_reg dst, struct brw_reg *src) 2185{ 2186 int op; 2187 2188 switch (inst->opcode) 
{ 2189 case FS_OPCODE_RCP: 2190 op = BRW_MATH_FUNCTION_INV; 2191 break; 2192 case FS_OPCODE_RSQ: 2193 op = BRW_MATH_FUNCTION_RSQ; 2194 break; 2195 case FS_OPCODE_SQRT: 2196 op = BRW_MATH_FUNCTION_SQRT; 2197 break; 2198 case FS_OPCODE_EXP2: 2199 op = BRW_MATH_FUNCTION_EXP; 2200 break; 2201 case FS_OPCODE_LOG2: 2202 op = BRW_MATH_FUNCTION_LOG; 2203 break; 2204 case FS_OPCODE_POW: 2205 op = BRW_MATH_FUNCTION_POW; 2206 break; 2207 case FS_OPCODE_SIN: 2208 op = BRW_MATH_FUNCTION_SIN; 2209 break; 2210 case FS_OPCODE_COS: 2211 op = BRW_MATH_FUNCTION_COS; 2212 break; 2213 default: 2214 assert(!"not reached: unknown math function"); 2215 op = 0; 2216 break; 2217 } 2218 2219 if (intel->gen >= 6) { 2220 assert(inst->mlen == 0); 2221 2222 if (inst->opcode == FS_OPCODE_POW) { 2223 brw_math2(p, dst, op, src[0], src[1]); 2224 } else { 2225 brw_math(p, dst, 2226 op, 2227 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2228 BRW_MATH_SATURATE_NONE, 2229 0, src[0], 2230 BRW_MATH_DATA_VECTOR, 2231 BRW_MATH_PRECISION_FULL); 2232 } 2233 } else { 2234 assert(inst->mlen >= 1); 2235 2236 brw_math(p, dst, 2237 op, 2238 inst->saturate ? 
BRW_MATH_SATURATE_SATURATE : 2239 BRW_MATH_SATURATE_NONE, 2240 inst->base_mrf, src[0], 2241 BRW_MATH_DATA_VECTOR, 2242 BRW_MATH_PRECISION_FULL); 2243 } 2244} 2245 2246void 2247fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst) 2248{ 2249 int msg_type = -1; 2250 int rlen = 4; 2251 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2252 2253 if (intel->gen >= 5) { 2254 switch (inst->opcode) { 2255 case FS_OPCODE_TEX: 2256 if (inst->shadow_compare) { 2257 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5; 2258 } else { 2259 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5; 2260 } 2261 break; 2262 case FS_OPCODE_TXB: 2263 if (inst->shadow_compare) { 2264 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5; 2265 } else { 2266 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5; 2267 } 2268 break; 2269 } 2270 } else { 2271 switch (inst->opcode) { 2272 case FS_OPCODE_TEX: 2273 /* Note that G45 and older determines shadow compare and dispatch width 2274 * from message length for most messages. 
2275 */ 2276 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2277 if (inst->shadow_compare) { 2278 assert(inst->mlen == 6); 2279 } else { 2280 assert(inst->mlen <= 4); 2281 } 2282 break; 2283 case FS_OPCODE_TXB: 2284 if (inst->shadow_compare) { 2285 assert(inst->mlen == 6); 2286 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2287 } else { 2288 assert(inst->mlen == 9); 2289 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2290 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2291 } 2292 break; 2293 } 2294 } 2295 assert(msg_type != -1); 2296 2297 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 2298 rlen = 8; 2299 dst = vec16(dst); 2300 } 2301 2302 brw_SAMPLE(p, 2303 retype(dst, BRW_REGISTER_TYPE_UW), 2304 inst->base_mrf, 2305 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), 2306 SURF_INDEX_TEXTURE(inst->sampler), 2307 inst->sampler, 2308 WRITEMASK_XYZW, 2309 msg_type, 2310 rlen, 2311 inst->mlen, 2312 0, 2313 1, 2314 simd_mode); 2315} 2316 2317 2318/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2319 * looking like: 2320 * 2321 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2322 * 2323 * and we're trying to produce: 2324 * 2325 * DDX DDY 2326 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2327 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2328 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2329 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2330 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2331 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2332 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2333 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2334 * 2335 * and add another set of two more subspans if in 16-pixel dispatch mode. 2336 * 2337 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2338 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2339 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2340 * between each other. 
We could probably do it like ddx and swizzle the right 2341 * order later, but bail for now and just produce 2342 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2343 */ 2344void 2345fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2346{ 2347 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2348 BRW_REGISTER_TYPE_F, 2349 BRW_VERTICAL_STRIDE_2, 2350 BRW_WIDTH_2, 2351 BRW_HORIZONTAL_STRIDE_0, 2352 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2353 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2354 BRW_REGISTER_TYPE_F, 2355 BRW_VERTICAL_STRIDE_2, 2356 BRW_WIDTH_2, 2357 BRW_HORIZONTAL_STRIDE_0, 2358 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2359 brw_ADD(p, dst, src0, negate(src1)); 2360} 2361 2362void 2363fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2364{ 2365 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2366 BRW_REGISTER_TYPE_F, 2367 BRW_VERTICAL_STRIDE_4, 2368 BRW_WIDTH_4, 2369 BRW_HORIZONTAL_STRIDE_0, 2370 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2371 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2372 BRW_REGISTER_TYPE_F, 2373 BRW_VERTICAL_STRIDE_4, 2374 BRW_WIDTH_4, 2375 BRW_HORIZONTAL_STRIDE_0, 2376 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2377 brw_ADD(p, dst, src0, negate(src1)); 2378} 2379 2380void 2381fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask) 2382{ 2383 if (intel->gen >= 6) { 2384 /* Gen6 no longer has the mask reg for us to just read the 2385 * active channels from. However, cmp updates just the channels 2386 * of the flag reg that are enabled, so we can get at the 2387 * channel enables that way. In this step, make a reg of ones 2388 * we'll compare to. 
2389 */ 2390 brw_MOV(p, mask, brw_imm_ud(1)); 2391 } else { 2392 brw_push_insn_state(p); 2393 brw_set_mask_control(p, BRW_MASK_DISABLE); 2394 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */ 2395 brw_pop_insn_state(p); 2396 } 2397} 2398 2399void 2400fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) 2401{ 2402 if (intel->gen >= 6) { 2403 struct brw_reg f0 = brw_flag_reg(); 2404 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2405 2406 brw_push_insn_state(p); 2407 brw_set_mask_control(p, BRW_MASK_DISABLE); 2408 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */ 2409 brw_pop_insn_state(p); 2410 2411 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2412 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */ 2413 /* Undo CMP's whacking of predication*/ 2414 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2415 2416 brw_push_insn_state(p); 2417 brw_set_mask_control(p, BRW_MASK_DISABLE); 2418 brw_AND(p, g1, f0, g1); 2419 brw_pop_insn_state(p); 2420 } else { 2421 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2422 2423 mask = brw_uw1_reg(mask.file, mask.nr, 0); 2424 2425 brw_push_insn_state(p); 2426 brw_set_mask_control(p, BRW_MASK_DISABLE); 2427 brw_AND(p, g0, mask, g0); 2428 brw_pop_insn_state(p); 2429 } 2430} 2431 2432void 2433fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2434{ 2435 assert(inst->mlen != 0); 2436 2437 brw_MOV(p, 2438 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2439 retype(src, BRW_REGISTER_TYPE_UD)); 2440 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2441 inst->offset); 2442} 2443 2444void 2445fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2446{ 2447 assert(inst->mlen != 0); 2448 2449 /* Clear any post destination dependencies that would be ignored by 2450 * the block read. See the B-Spec for pre-gen5 send instruction. 
2451 * 2452 * This could use a better solution, since texture sampling and 2453 * math reads could potentially run into it as well -- anywhere 2454 * that we have a SEND with a destination that is a register that 2455 * was written but not read within the last N instructions (what's 2456 * N? unsure). This is rare because of dead code elimination, but 2457 * not impossible. 2458 */ 2459 if (intel->gen == 4 && !intel->is_g4x) 2460 brw_MOV(p, brw_null_reg(), dst); 2461 2462 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2463 inst->offset); 2464 2465 if (intel->gen == 4 && !intel->is_g4x) { 2466 /* gen4 errata: destination from a send can't be used as a 2467 * destination until it's been read. Just read it so we don't 2468 * have to worry. 2469 */ 2470 brw_MOV(p, brw_null_reg(), dst); 2471 } 2472} 2473 2474 2475void 2476fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2477{ 2478 assert(inst->mlen != 0); 2479 2480 /* Clear any post destination dependencies that would be ignored by 2481 * the block read. See the B-Spec for pre-gen5 send instruction. 2482 * 2483 * This could use a better solution, since texture sampling and 2484 * math reads could potentially run into it as well -- anywhere 2485 * that we have a SEND with a destination that is a register that 2486 * was written but not read within the last N instructions (what's 2487 * N? unsure). This is rare because of dead code elimination, but 2488 * not impossible. 2489 */ 2490 if (intel->gen == 4 && !intel->is_g4x) 2491 brw_MOV(p, brw_null_reg(), dst); 2492 2493 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2494 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2495 2496 if (intel->gen == 4 && !intel->is_g4x) { 2497 /* gen4 errata: destination from a send can't be used as a 2498 * destination until it's been read. Just read it so we don't 2499 * have to worry. 
2500 */ 2501 brw_MOV(p, brw_null_reg(), dst); 2502 } 2503} 2504 2505/** 2506 * To be called after the last _mesa_add_state_reference() call, to 2507 * set up prog_data.param[] for assign_curb_setup() and 2508 * setup_pull_constants(). 2509 */ 2510void 2511fs_visitor::setup_paramvalues_refs() 2512{ 2513 /* Set up the pointers to ParamValues now that that array is finalized. */ 2514 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2515 c->prog_data.param[i] = 2516 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2517 this->param_offset[i]; 2518 } 2519} 2520 2521void 2522fs_visitor::assign_curb_setup() 2523{ 2524 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2525 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2526 2527 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2528 foreach_iter(exec_list_iterator, iter, this->instructions) { 2529 fs_inst *inst = (fs_inst *)iter.get(); 2530 2531 for (unsigned int i = 0; i < 3; i++) { 2532 if (inst->src[i].file == UNIFORM) { 2533 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2534 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf + 2535 constant_nr / 8, 2536 constant_nr % 8); 2537 2538 inst->src[i].file = FIXED_HW_REG; 2539 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2540 } 2541 } 2542 } 2543} 2544 2545void 2546fs_visitor::calculate_urb_setup() 2547{ 2548 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2549 urb_setup[i] = -1; 2550 } 2551 2552 int urb_next = 0; 2553 /* Figure out where each of the incoming setup attributes lands. */ 2554 if (intel->gen >= 6) { 2555 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2556 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2557 urb_setup[i] = urb_next++; 2558 } 2559 } 2560 } else { 2561 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ 2562 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2563 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2564 int fp_index; 2565 2566 if (i >= VERT_RESULT_VAR0) 2567 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2568 else if (i <= VERT_RESULT_TEX7) 2569 fp_index = i; 2570 else 2571 fp_index = -1; 2572 2573 if (fp_index >= 0) 2574 urb_setup[fp_index] = urb_next++; 2575 } 2576 } 2577 } 2578 2579 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2580 c->prog_data.urb_read_length = urb_next * 2; 2581} 2582 2583void 2584fs_visitor::assign_urb_setup() 2585{ 2586 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length; 2587 2588 /* Offset all the urb_setup[] index by the actual position of the 2589 * setup regs, now that the location of the constants has been chosen. 2590 */ 2591 foreach_iter(exec_list_iterator, iter, this->instructions) { 2592 fs_inst *inst = (fs_inst *)iter.get(); 2593 2594 if (inst->opcode == FS_OPCODE_LINTERP) { 2595 assert(inst->src[2].file == FIXED_HW_REG); 2596 inst->src[2].fixed_hw_reg.nr += urb_start; 2597 } 2598 2599 if (inst->opcode == FS_OPCODE_CINTERP) { 2600 assert(inst->src[0].file == FIXED_HW_REG); 2601 inst->src[0].fixed_hw_reg.nr += urb_start; 2602 } 2603 } 2604 2605 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2606} 2607 2608/** 2609 * Split large virtual GRFs into separate components if we can. 2610 * 2611 * This is mostly duplicated with what brw_fs_vector_splitting does, 2612 * but that's really conservative because it's afraid of doing 2613 * splitting that doesn't result in real progress after the rest of 2614 * the optimization phases, which would cause infinite looping in 2615 * optimization. We can do it once here, safely. This also has the 2616 * opportunity to split interpolated values, or maybe even uniforms, 2617 * which we don't have at the IR level. 
2618 * 2619 * We want to split, because virtual GRFs are what we register 2620 * allocate and spill (due to contiguousness requirements for some 2621 * instructions), and they're what we naturally generate in the 2622 * codegen process, but most virtual GRFs don't actually need to be 2623 * contiguous sets of GRFs. If we split, we'll end up with reduced 2624 * live intervals and better dead code elimination and coalescing. 2625 */ 2626void 2627fs_visitor::split_virtual_grfs() 2628{ 2629 int num_vars = this->virtual_grf_next; 2630 bool split_grf[num_vars]; 2631 int new_virtual_grf[num_vars]; 2632 2633 /* Try to split anything > 0 sized. */ 2634 for (int i = 0; i < num_vars; i++) { 2635 if (this->virtual_grf_sizes[i] != 1) 2636 split_grf[i] = true; 2637 else 2638 split_grf[i] = false; 2639 } 2640 2641 if (brw->has_pln) { 2642 /* PLN opcodes rely on the delta_xy being contiguous. */ 2643 split_grf[this->delta_x.reg] = false; 2644 } 2645 2646 foreach_iter(exec_list_iterator, iter, this->instructions) { 2647 fs_inst *inst = (fs_inst *)iter.get(); 2648 2649 /* Texturing produces 4 contiguous registers, so no splitting. */ 2650 if (inst->is_tex()) { 2651 split_grf[inst->dst.reg] = false; 2652 } 2653 } 2654 2655 /* Allocate new space for split regs. Note that the virtual 2656 * numbers will be contiguous. 
2657 */ 2658 for (int i = 0; i < num_vars; i++) { 2659 if (split_grf[i]) { 2660 new_virtual_grf[i] = virtual_grf_alloc(1); 2661 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 2662 int reg = virtual_grf_alloc(1); 2663 assert(reg == new_virtual_grf[i] + j - 1); 2664 (void) reg; 2665 } 2666 this->virtual_grf_sizes[i] = 1; 2667 } 2668 } 2669 2670 foreach_iter(exec_list_iterator, iter, this->instructions) { 2671 fs_inst *inst = (fs_inst *)iter.get(); 2672 2673 if (inst->dst.file == GRF && 2674 split_grf[inst->dst.reg] && 2675 inst->dst.reg_offset != 0) { 2676 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 2677 inst->dst.reg_offset - 1); 2678 inst->dst.reg_offset = 0; 2679 } 2680 for (int i = 0; i < 3; i++) { 2681 if (inst->src[i].file == GRF && 2682 split_grf[inst->src[i].reg] && 2683 inst->src[i].reg_offset != 0) { 2684 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 2685 inst->src[i].reg_offset - 1); 2686 inst->src[i].reg_offset = 0; 2687 } 2688 } 2689 } 2690 this->live_intervals_valid = false; 2691} 2692 2693/** 2694 * Choose accesses from the UNIFORM file to demote to using the pull 2695 * constant buffer. 2696 * 2697 * We allow a fragment shader to have more than the specified minimum 2698 * maximum number of fragment shader uniform components (64). If 2699 * there are too many of these, they'd fill up all of register space. 2700 * So, this will push some of them out to the pull constant buffer and 2701 * update the program to load them. 2702 */ 2703void 2704fs_visitor::setup_pull_constants() 2705{ 2706 /* Only allow 16 registers (128 uniform components) as push constants. */ 2707 unsigned int max_uniform_components = 16 * 8; 2708 if (c->prog_data.nr_params <= max_uniform_components) 2709 return; 2710 2711 /* Just demote the end of the list. We could probably do better 2712 * here, demoting things that are rarely used in the program first. 
2713 */ 2714 int pull_uniform_base = max_uniform_components; 2715 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 2716 2717 foreach_iter(exec_list_iterator, iter, this->instructions) { 2718 fs_inst *inst = (fs_inst *)iter.get(); 2719 2720 for (int i = 0; i < 3; i++) { 2721 if (inst->src[i].file != UNIFORM) 2722 continue; 2723 2724 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2725 if (uniform_nr < pull_uniform_base) 2726 continue; 2727 2728 fs_reg dst = fs_reg(this, glsl_type::float_type); 2729 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 2730 dst); 2731 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 2732 pull->ir = inst->ir; 2733 pull->annotation = inst->annotation; 2734 pull->base_mrf = 14; 2735 pull->mlen = 1; 2736 2737 inst->insert_before(pull); 2738 2739 inst->src[i].file = GRF; 2740 inst->src[i].reg = dst.reg; 2741 inst->src[i].reg_offset = 0; 2742 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 2743 } 2744 } 2745 2746 for (int i = 0; i < pull_uniform_count; i++) { 2747 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 2748 c->prog_data.pull_param_convert[i] = 2749 c->prog_data.param_convert[pull_uniform_base + i]; 2750 } 2751 c->prog_data.nr_params -= pull_uniform_count; 2752 c->prog_data.nr_pull_params = pull_uniform_count; 2753} 2754 2755void 2756fs_visitor::calculate_live_intervals() 2757{ 2758 int num_vars = this->virtual_grf_next; 2759 int *def = talloc_array(mem_ctx, int, num_vars); 2760 int *use = talloc_array(mem_ctx, int, num_vars); 2761 int loop_depth = 0; 2762 int loop_start = 0; 2763 int bb_header_ip = 0; 2764 2765 if (this->live_intervals_valid) 2766 return; 2767 2768 for (int i = 0; i < num_vars; i++) { 2769 def[i] = MAX_INSTRUCTION; 2770 use[i] = -1; 2771 } 2772 2773 int ip = 0; 2774 foreach_iter(exec_list_iterator, iter, this->instructions) { 2775 fs_inst *inst = (fs_inst *)iter.get(); 2776 2777 if (inst->opcode == BRW_OPCODE_DO) { 2778 
if (loop_depth++ == 0) 2779 loop_start = ip; 2780 } else if (inst->opcode == BRW_OPCODE_WHILE) { 2781 loop_depth--; 2782 2783 if (loop_depth == 0) { 2784 /* Patches up the use of vars marked for being live across 2785 * the whole loop. 2786 */ 2787 for (int i = 0; i < num_vars; i++) { 2788 if (use[i] == loop_start) { 2789 use[i] = ip; 2790 } 2791 } 2792 } 2793 } else { 2794 for (unsigned int i = 0; i < 3; i++) { 2795 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 2796 int reg = inst->src[i].reg; 2797 2798 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2799 def[reg] >= bb_header_ip)) { 2800 use[reg] = ip; 2801 } else { 2802 def[reg] = MIN2(loop_start, def[reg]); 2803 use[reg] = loop_start; 2804 2805 /* Nobody else is going to go smash our start to 2806 * later in the loop now, because def[reg] now 2807 * points before the bb header. 2808 */ 2809 } 2810 } 2811 } 2812 if (inst->dst.file == GRF && inst->dst.reg != 0) { 2813 int reg = inst->dst.reg; 2814 2815 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2816 !inst->predicated)) { 2817 def[reg] = MIN2(def[reg], ip); 2818 } else { 2819 def[reg] = MIN2(def[reg], loop_start); 2820 } 2821 } 2822 } 2823 2824 ip++; 2825 2826 /* Set the basic block header IP. This is used for determining 2827 * if a complete def of single-register virtual GRF in a loop 2828 * dominates a use in the same basic block. It's a quick way to 2829 * reduce the live interval range of most register used in a 2830 * loop. 
2831 */ 2832 if (inst->opcode == BRW_OPCODE_IF || 2833 inst->opcode == BRW_OPCODE_ELSE || 2834 inst->opcode == BRW_OPCODE_ENDIF || 2835 inst->opcode == BRW_OPCODE_DO || 2836 inst->opcode == BRW_OPCODE_WHILE || 2837 inst->opcode == BRW_OPCODE_BREAK || 2838 inst->opcode == BRW_OPCODE_CONTINUE) { 2839 bb_header_ip = ip; 2840 } 2841 } 2842 2843 talloc_free(this->virtual_grf_def); 2844 talloc_free(this->virtual_grf_use); 2845 this->virtual_grf_def = def; 2846 this->virtual_grf_use = use; 2847 2848 this->live_intervals_valid = true; 2849} 2850 2851/** 2852 * Attempts to move immediate constants into the immediate 2853 * constant slot of following instructions. 2854 * 2855 * Immediate constants are a bit tricky -- they have to be in the last 2856 * operand slot, you can't do abs/negate on them, 2857 */ 2858 2859bool 2860fs_visitor::propagate_constants() 2861{ 2862 bool progress = false; 2863 2864 calculate_live_intervals(); 2865 2866 foreach_iter(exec_list_iterator, iter, this->instructions) { 2867 fs_inst *inst = (fs_inst *)iter.get(); 2868 2869 if (inst->opcode != BRW_OPCODE_MOV || 2870 inst->predicated || 2871 inst->dst.file != GRF || inst->src[0].file != IMM || 2872 inst->dst.type != inst->src[0].type) 2873 continue; 2874 2875 /* Don't bother with cases where we should have had the 2876 * operation on the constant folded in GLSL already. 2877 */ 2878 if (inst->saturate) 2879 continue; 2880 2881 /* Found a move of a constant to a GRF. Find anything else using the GRF 2882 * before it's written, and replace it with the constant if we can. 
2883 */ 2884 exec_list_iterator scan_iter = iter; 2885 scan_iter.next(); 2886 for (; scan_iter.has_next(); scan_iter.next()) { 2887 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 2888 2889 if (scan_inst->opcode == BRW_OPCODE_DO || 2890 scan_inst->opcode == BRW_OPCODE_WHILE || 2891 scan_inst->opcode == BRW_OPCODE_ELSE || 2892 scan_inst->opcode == BRW_OPCODE_ENDIF) { 2893 break; 2894 } 2895 2896 for (int i = 2; i >= 0; i--) { 2897 if (scan_inst->src[i].file != GRF || 2898 scan_inst->src[i].reg != inst->dst.reg || 2899 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 2900 continue; 2901 2902 /* Don't bother with cases where we should have had the 2903 * operation on the constant folded in GLSL already. 2904 */ 2905 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 2906 continue; 2907 2908 switch (scan_inst->opcode) { 2909 case BRW_OPCODE_MOV: 2910 scan_inst->src[i] = inst->src[0]; 2911 progress = true; 2912 break; 2913 2914 case BRW_OPCODE_MUL: 2915 case BRW_OPCODE_ADD: 2916 if (i == 1) { 2917 scan_inst->src[i] = inst->src[0]; 2918 progress = true; 2919 } else if (i == 0 && scan_inst->src[1].file != IMM) { 2920 /* Fit this constant in by commuting the operands */ 2921 scan_inst->src[0] = scan_inst->src[1]; 2922 scan_inst->src[1] = inst->src[0]; 2923 progress = true; 2924 } 2925 break; 2926 case BRW_OPCODE_CMP: 2927 case BRW_OPCODE_SEL: 2928 if (i == 1) { 2929 scan_inst->src[i] = inst->src[0]; 2930 progress = true; 2931 } 2932 } 2933 } 2934 2935 if (scan_inst->dst.file == GRF && 2936 scan_inst->dst.reg == inst->dst.reg && 2937 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 2938 scan_inst->is_tex())) { 2939 break; 2940 } 2941 } 2942 } 2943 2944 if (progress) 2945 this->live_intervals_valid = false; 2946 2947 return progress; 2948} 2949/** 2950 * Must be called after calculate_live_intervales() to remove unused 2951 * writes to registers -- register allocation will fail otherwise 2952 * because something deffed but not used won't be considered to 2953 
* interfere with other regs. 2954 */ 2955bool 2956fs_visitor::dead_code_eliminate() 2957{ 2958 bool progress = false; 2959 int pc = 0; 2960 2961 calculate_live_intervals(); 2962 2963 foreach_iter(exec_list_iterator, iter, this->instructions) { 2964 fs_inst *inst = (fs_inst *)iter.get(); 2965 2966 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 2967 inst->remove(); 2968 progress = true; 2969 } 2970 2971 pc++; 2972 } 2973 2974 if (progress) 2975 live_intervals_valid = false; 2976 2977 return progress; 2978} 2979 2980bool 2981fs_visitor::register_coalesce() 2982{ 2983 bool progress = false; 2984 int if_depth = 0; 2985 int loop_depth = 0; 2986 2987 foreach_iter(exec_list_iterator, iter, this->instructions) { 2988 fs_inst *inst = (fs_inst *)iter.get(); 2989 2990 /* Make sure that we dominate the instructions we're going to 2991 * scan for interfering with our coalescing, or we won't have 2992 * scanned enough to see if anything interferes with our 2993 * coalescing. We don't dominate the following instructions if 2994 * we're in a loop or an if block. 2995 */ 2996 switch (inst->opcode) { 2997 case BRW_OPCODE_DO: 2998 loop_depth++; 2999 break; 3000 case BRW_OPCODE_WHILE: 3001 loop_depth--; 3002 break; 3003 case BRW_OPCODE_IF: 3004 if_depth++; 3005 break; 3006 case BRW_OPCODE_ENDIF: 3007 if_depth--; 3008 break; 3009 } 3010 if (loop_depth || if_depth) 3011 continue; 3012 3013 if (inst->opcode != BRW_OPCODE_MOV || 3014 inst->predicated || 3015 inst->saturate || 3016 inst->dst.file != GRF || inst->src[0].file != GRF || 3017 inst->dst.type != inst->src[0].type) 3018 continue; 3019 3020 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3021 * them: check for no writes to either one until the exit of the 3022 * program. 
3023 */ 3024 bool interfered = false; 3025 exec_list_iterator scan_iter = iter; 3026 scan_iter.next(); 3027 for (; scan_iter.has_next(); scan_iter.next()) { 3028 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3029 3030 if (scan_inst->dst.file == GRF) { 3031 if (scan_inst->dst.reg == inst->dst.reg && 3032 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3033 scan_inst->is_tex())) { 3034 interfered = true; 3035 break; 3036 } 3037 if (scan_inst->dst.reg == inst->src[0].reg && 3038 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3039 scan_inst->is_tex())) { 3040 interfered = true; 3041 break; 3042 } 3043 } 3044 } 3045 if (interfered) { 3046 continue; 3047 } 3048 3049 /* Rewrite the later usage to point at the source of the move to 3050 * be removed. 3051 */ 3052 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3053 scan_iter.next()) { 3054 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3055 3056 for (int i = 0; i < 3; i++) { 3057 if (scan_inst->src[i].file == GRF && 3058 scan_inst->src[i].reg == inst->dst.reg && 3059 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3060 scan_inst->src[i].reg = inst->src[0].reg; 3061 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3062 scan_inst->src[i].abs |= inst->src[0].abs; 3063 scan_inst->src[i].negate ^= inst->src[0].negate; 3064 scan_inst->src[i].smear = inst->src[0].smear; 3065 } 3066 } 3067 } 3068 3069 inst->remove(); 3070 progress = true; 3071 } 3072 3073 if (progress) 3074 live_intervals_valid = false; 3075 3076 return progress; 3077} 3078 3079 3080bool 3081fs_visitor::compute_to_mrf() 3082{ 3083 bool progress = false; 3084 int next_ip = 0; 3085 3086 calculate_live_intervals(); 3087 3088 foreach_iter(exec_list_iterator, iter, this->instructions) { 3089 fs_inst *inst = (fs_inst *)iter.get(); 3090 3091 int ip = next_ip; 3092 next_ip++; 3093 3094 if (inst->opcode != BRW_OPCODE_MOV || 3095 inst->predicated || 3096 inst->dst.file != MRF || inst->src[0].file != GRF || 3097 
inst->dst.type != inst->src[0].type || 3098 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3099 continue; 3100 3101 /* Can't compute-to-MRF this GRF if someone else was going to 3102 * read it later. 3103 */ 3104 if (this->virtual_grf_use[inst->src[0].reg] > ip) 3105 continue; 3106 3107 /* Found a move of a GRF to a MRF. Let's see if we can go 3108 * rewrite the thing that made this GRF to write into the MRF. 3109 */ 3110 fs_inst *scan_inst; 3111 for (scan_inst = (fs_inst *)inst->prev; 3112 scan_inst->prev != NULL; 3113 scan_inst = (fs_inst *)scan_inst->prev) { 3114 if (scan_inst->dst.file == GRF && 3115 scan_inst->dst.reg == inst->src[0].reg) { 3116 /* Found the last thing to write our reg we want to turn 3117 * into a compute-to-MRF. 3118 */ 3119 3120 if (scan_inst->is_tex()) { 3121 /* texturing writes several continuous regs, so we can't 3122 * compute-to-mrf that. 3123 */ 3124 break; 3125 } 3126 3127 /* If it's predicated, it (probably) didn't populate all 3128 * the channels. 3129 */ 3130 if (scan_inst->predicated) 3131 break; 3132 3133 /* SEND instructions can't have MRF as a destination. */ 3134 if (scan_inst->mlen) 3135 break; 3136 3137 if (intel->gen >= 6) { 3138 /* gen6 math instructions must have the destination be 3139 * GRF, so no compute-to-MRF for them. 3140 */ 3141 if (scan_inst->opcode == FS_OPCODE_RCP || 3142 scan_inst->opcode == FS_OPCODE_RSQ || 3143 scan_inst->opcode == FS_OPCODE_SQRT || 3144 scan_inst->opcode == FS_OPCODE_EXP2 || 3145 scan_inst->opcode == FS_OPCODE_LOG2 || 3146 scan_inst->opcode == FS_OPCODE_SIN || 3147 scan_inst->opcode == FS_OPCODE_COS || 3148 scan_inst->opcode == FS_OPCODE_POW) { 3149 break; 3150 } 3151 } 3152 3153 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 3154 /* Found the creator of our MRF's source value. 
*/ 3155 scan_inst->dst.file = MRF; 3156 scan_inst->dst.hw_reg = inst->dst.hw_reg; 3157 scan_inst->saturate |= inst->saturate; 3158 inst->remove(); 3159 progress = true; 3160 } 3161 break; 3162 } 3163 3164 /* We don't handle flow control here. Most computation of 3165 * values that end up in MRFs are shortly before the MRF 3166 * write anyway. 3167 */ 3168 if (scan_inst->opcode == BRW_OPCODE_DO || 3169 scan_inst->opcode == BRW_OPCODE_WHILE || 3170 scan_inst->opcode == BRW_OPCODE_ELSE || 3171 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3172 break; 3173 } 3174 3175 /* You can't read from an MRF, so if someone else reads our 3176 * MRF's source GRF that we wanted to rewrite, that stops us. 3177 */ 3178 bool interfered = false; 3179 for (int i = 0; i < 3; i++) { 3180 if (scan_inst->src[i].file == GRF && 3181 scan_inst->src[i].reg == inst->src[0].reg && 3182 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 3183 interfered = true; 3184 } 3185 } 3186 if (interfered) 3187 break; 3188 3189 if (scan_inst->dst.file == MRF && 3190 scan_inst->dst.hw_reg == inst->dst.hw_reg) { 3191 /* Somebody else wrote our MRF here, so we can't can't 3192 * compute-to-MRF before that. 3193 */ 3194 break; 3195 } 3196 3197 if (scan_inst->mlen > 0) { 3198 /* Found a SEND instruction, which means that there are 3199 * live values in MRFs from base_mrf to base_mrf + 3200 * scan_inst->mlen - 1. Don't go pushing our MRF write up 3201 * above it. 3202 */ 3203 if (inst->dst.hw_reg >= scan_inst->base_mrf && 3204 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) { 3205 break; 3206 } 3207 } 3208 } 3209 } 3210 3211 return progress; 3212} 3213 3214/** 3215 * Walks through basic blocks, locking for repeated MRF writes and 3216 * removing the later ones. 
3217 */ 3218bool 3219fs_visitor::remove_duplicate_mrf_writes() 3220{ 3221 fs_inst *last_mrf_move[16]; 3222 bool progress = false; 3223 3224 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3225 3226 foreach_iter(exec_list_iterator, iter, this->instructions) { 3227 fs_inst *inst = (fs_inst *)iter.get(); 3228 3229 switch (inst->opcode) { 3230 case BRW_OPCODE_DO: 3231 case BRW_OPCODE_WHILE: 3232 case BRW_OPCODE_IF: 3233 case BRW_OPCODE_ELSE: 3234 case BRW_OPCODE_ENDIF: 3235 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3236 continue; 3237 default: 3238 break; 3239 } 3240 3241 if (inst->opcode == BRW_OPCODE_MOV && 3242 inst->dst.file == MRF) { 3243 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 3244 if (prev_inst && inst->equals(prev_inst)) { 3245 inst->remove(); 3246 progress = true; 3247 continue; 3248 } 3249 } 3250 3251 /* Clear out the last-write records for MRFs that were overwritten. */ 3252 if (inst->dst.file == MRF) { 3253 last_mrf_move[inst->dst.hw_reg] = NULL; 3254 } 3255 3256 if (inst->mlen > 0) { 3257 /* Found a SEND instruction, which will include two or fewer 3258 * implied MRF writes. We could do better here. 3259 */ 3260 for (int i = 0; i < implied_mrf_writes(inst); i++) { 3261 last_mrf_move[inst->base_mrf + i] = NULL; 3262 } 3263 } 3264 3265 /* Clear out any MRF move records whose sources got overwritten. 
*/ 3266 if (inst->dst.file == GRF) { 3267 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3268 if (last_mrf_move[i] && 3269 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3270 last_mrf_move[i] = NULL; 3271 } 3272 } 3273 } 3274 3275 if (inst->opcode == BRW_OPCODE_MOV && 3276 inst->dst.file == MRF && 3277 inst->src[0].file == GRF && 3278 !inst->predicated) { 3279 last_mrf_move[inst->dst.hw_reg] = inst; 3280 } 3281 } 3282 3283 return progress; 3284} 3285 3286bool 3287fs_visitor::virtual_grf_interferes(int a, int b) 3288{ 3289 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3290 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3291 3292 /* We can't handle dead register writes here, without iterating 3293 * over the whole instruction stream to find every single dead 3294 * write to that register to compare to the live interval of the 3295 * other register. Just assert that dead_code_eliminate() has been 3296 * called. 3297 */ 3298 assert((this->virtual_grf_use[a] != -1 || 3299 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3300 (this->virtual_grf_use[b] != -1 || 3301 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3302 3303 return start < end; 3304} 3305 3306static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3307{ 3308 struct brw_reg brw_reg; 3309 3310 switch (reg->file) { 3311 case GRF: 3312 case ARF: 3313 case MRF: 3314 if (reg->smear == -1) { 3315 brw_reg = brw_vec8_reg(reg->file, 3316 reg->hw_reg, 0); 3317 } else { 3318 brw_reg = brw_vec1_reg(reg->file, 3319 reg->hw_reg, reg->smear); 3320 } 3321 brw_reg = retype(brw_reg, reg->type); 3322 break; 3323 case IMM: 3324 switch (reg->type) { 3325 case BRW_REGISTER_TYPE_F: 3326 brw_reg = brw_imm_f(reg->imm.f); 3327 break; 3328 case BRW_REGISTER_TYPE_D: 3329 brw_reg = brw_imm_d(reg->imm.i); 3330 break; 3331 case BRW_REGISTER_TYPE_UD: 3332 brw_reg = brw_imm_ud(reg->imm.u); 3333 break; 3334 default: 3335 assert(!"not reached"); 3336 brw_reg = brw_null_reg(); 3337 
break; 3338 } 3339 break; 3340 case FIXED_HW_REG: 3341 brw_reg = reg->fixed_hw_reg; 3342 break; 3343 case BAD_FILE: 3344 /* Probably unused. */ 3345 brw_reg = brw_null_reg(); 3346 break; 3347 case UNIFORM: 3348 assert(!"not reached"); 3349 brw_reg = brw_null_reg(); 3350 break; 3351 default: 3352 assert(!"not reached"); 3353 brw_reg = brw_null_reg(); 3354 break; 3355 } 3356 if (reg->abs) 3357 brw_reg = brw_abs(brw_reg); 3358 if (reg->negate) 3359 brw_reg = negate(brw_reg); 3360 3361 return brw_reg; 3362} 3363 3364void 3365fs_visitor::generate_code() 3366{ 3367 int last_native_inst = 0; 3368 struct brw_instruction *if_stack[16], *loop_stack[16]; 3369 int if_stack_depth = 0, loop_stack_depth = 0; 3370 int if_depth_in_loop[16]; 3371 const char *last_annotation_string = NULL; 3372 ir_instruction *last_annotation_ir = NULL; 3373 3374 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3375 printf("Native code for fragment shader %d:\n", 3376 ctx->Shader.CurrentFragmentProgram->Name); 3377 } 3378 3379 if_depth_in_loop[loop_stack_depth] = 0; 3380 3381 memset(&if_stack, 0, sizeof(if_stack)); 3382 foreach_iter(exec_list_iterator, iter, this->instructions) { 3383 fs_inst *inst = (fs_inst *)iter.get(); 3384 struct brw_reg src[3], dst; 3385 3386 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3387 if (last_annotation_ir != inst->ir) { 3388 last_annotation_ir = inst->ir; 3389 if (last_annotation_ir) { 3390 printf(" "); 3391 last_annotation_ir->print(); 3392 printf("\n"); 3393 } 3394 } 3395 if (last_annotation_string != inst->annotation) { 3396 last_annotation_string = inst->annotation; 3397 if (last_annotation_string) 3398 printf(" %s\n", last_annotation_string); 3399 } 3400 } 3401 3402 for (unsigned int i = 0; i < 3; i++) { 3403 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3404 } 3405 dst = brw_reg_from_fs_reg(&inst->dst); 3406 3407 brw_set_conditionalmod(p, inst->conditional_mod); 3408 brw_set_predicate_control(p, inst->predicated); 3409 brw_set_saturate(p, inst->saturate); 3410 3411 switch 
(inst->opcode) { 3412 case BRW_OPCODE_MOV: 3413 brw_MOV(p, dst, src[0]); 3414 break; 3415 case BRW_OPCODE_ADD: 3416 brw_ADD(p, dst, src[0], src[1]); 3417 break; 3418 case BRW_OPCODE_MUL: 3419 brw_MUL(p, dst, src[0], src[1]); 3420 break; 3421 3422 case BRW_OPCODE_FRC: 3423 brw_FRC(p, dst, src[0]); 3424 break; 3425 case BRW_OPCODE_RNDD: 3426 brw_RNDD(p, dst, src[0]); 3427 break; 3428 case BRW_OPCODE_RNDE: 3429 brw_RNDE(p, dst, src[0]); 3430 break; 3431 case BRW_OPCODE_RNDZ: 3432 brw_RNDZ(p, dst, src[0]); 3433 break; 3434 3435 case BRW_OPCODE_AND: 3436 brw_AND(p, dst, src[0], src[1]); 3437 break; 3438 case BRW_OPCODE_OR: 3439 brw_OR(p, dst, src[0], src[1]); 3440 break; 3441 case BRW_OPCODE_XOR: 3442 brw_XOR(p, dst, src[0], src[1]); 3443 break; 3444 case BRW_OPCODE_NOT: 3445 brw_NOT(p, dst, src[0]); 3446 break; 3447 case BRW_OPCODE_ASR: 3448 brw_ASR(p, dst, src[0], src[1]); 3449 break; 3450 case BRW_OPCODE_SHR: 3451 brw_SHR(p, dst, src[0], src[1]); 3452 break; 3453 case BRW_OPCODE_SHL: 3454 brw_SHL(p, dst, src[0], src[1]); 3455 break; 3456 3457 case BRW_OPCODE_CMP: 3458 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3459 break; 3460 case BRW_OPCODE_SEL: 3461 brw_SEL(p, dst, src[0], src[1]); 3462 break; 3463 3464 case BRW_OPCODE_IF: 3465 assert(if_stack_depth < 16); 3466 if (inst->src[0].file != BAD_FILE) { 3467 assert(intel->gen >= 6); 3468 if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]); 3469 } else { 3470 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8); 3471 } 3472 if_depth_in_loop[loop_stack_depth]++; 3473 if_stack_depth++; 3474 break; 3475 3476 case BRW_OPCODE_ELSE: 3477 if_stack[if_stack_depth - 1] = 3478 brw_ELSE(p, if_stack[if_stack_depth - 1]); 3479 break; 3480 case BRW_OPCODE_ENDIF: 3481 if_stack_depth--; 3482 brw_ENDIF(p , if_stack[if_stack_depth]); 3483 if_depth_in_loop[loop_stack_depth]--; 3484 break; 3485 3486 case BRW_OPCODE_DO: 3487 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3488 
if_depth_in_loop[loop_stack_depth] = 0; 3489 break; 3490 3491 case BRW_OPCODE_BREAK: 3492 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3493 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3494 break; 3495 case BRW_OPCODE_CONTINUE: 3496 /* FINISHME: We need to write the loop instruction support still. */ 3497 if (intel->gen >= 6) 3498 brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]); 3499 else 3500 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3501 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3502 break; 3503 3504 case BRW_OPCODE_WHILE: { 3505 struct brw_instruction *inst0, *inst1; 3506 GLuint br = 1; 3507 3508 if (intel->gen >= 5) 3509 br = 2; 3510 3511 assert(loop_stack_depth > 0); 3512 loop_stack_depth--; 3513 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3514 if (intel->gen < 6) { 3515 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3516 while (inst0 > loop_stack[loop_stack_depth]) { 3517 inst0--; 3518 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3519 inst0->bits3.if_else.jump_count == 0) { 3520 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3521 } 3522 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3523 inst0->bits3.if_else.jump_count == 0) { 3524 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3525 } 3526 } 3527 } 3528 } 3529 break; 3530 3531 case FS_OPCODE_RCP: 3532 case FS_OPCODE_RSQ: 3533 case FS_OPCODE_SQRT: 3534 case FS_OPCODE_EXP2: 3535 case FS_OPCODE_LOG2: 3536 case FS_OPCODE_POW: 3537 case FS_OPCODE_SIN: 3538 case FS_OPCODE_COS: 3539 generate_math(inst, dst, src); 3540 break; 3541 case FS_OPCODE_CINTERP: 3542 brw_MOV(p, dst, src[0]); 3543 break; 3544 case FS_OPCODE_LINTERP: 3545 generate_linterp(inst, dst, src); 3546 break; 3547 case FS_OPCODE_TEX: 3548 case FS_OPCODE_TXB: 3549 case FS_OPCODE_TXL: 3550 generate_tex(inst, dst); 3551 break; 3552 case FS_OPCODE_DISCARD_NOT: 3553 generate_discard_not(inst, dst); 3554 break; 3555 case FS_OPCODE_DISCARD_AND: 3556 
generate_discard_and(inst, src[0]); 3557 break; 3558 case FS_OPCODE_DDX: 3559 generate_ddx(inst, dst, src[0]); 3560 break; 3561 case FS_OPCODE_DDY: 3562 generate_ddy(inst, dst, src[0]); 3563 break; 3564 3565 case FS_OPCODE_SPILL: 3566 generate_spill(inst, src[0]); 3567 break; 3568 3569 case FS_OPCODE_UNSPILL: 3570 generate_unspill(inst, dst); 3571 break; 3572 3573 case FS_OPCODE_PULL_CONSTANT_LOAD: 3574 generate_pull_constant_load(inst, dst); 3575 break; 3576 3577 case FS_OPCODE_FB_WRITE: 3578 generate_fb_write(inst); 3579 break; 3580 default: 3581 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { 3582 _mesa_problem(ctx, "Unsupported opcode `%s' in FS", 3583 brw_opcodes[inst->opcode].name); 3584 } else { 3585 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); 3586 } 3587 this->fail = true; 3588 } 3589 3590 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3591 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) { 3592 if (0) { 3593 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3594 ((uint32_t *)&p->store[i])[3], 3595 ((uint32_t *)&p->store[i])[2], 3596 ((uint32_t *)&p->store[i])[1], 3597 ((uint32_t *)&p->store[i])[0]); 3598 } 3599 brw_disasm(stdout, &p->store[i], intel->gen); 3600 } 3601 } 3602 3603 last_native_inst = p->nr_insn; 3604 } 3605 3606 brw_set_uip_jip(p); 3607 3608 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS 3609 * emit issues, it doesn't get the jump distances into the output, 3610 * which is often something we want to debug. So this is here in 3611 * case you're doing that. 
3612 */ 3613 if (0) { 3614 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3615 for (unsigned int i = 0; i < p->nr_insn; i++) { 3616 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3617 ((uint32_t *)&p->store[i])[3], 3618 ((uint32_t *)&p->store[i])[2], 3619 ((uint32_t *)&p->store[i])[1], 3620 ((uint32_t *)&p->store[i])[0]); 3621 brw_disasm(stdout, &p->store[i], intel->gen); 3622 } 3623 } 3624 } 3625} 3626 3627GLboolean 3628brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) 3629{ 3630 struct intel_context *intel = &brw->intel; 3631 struct gl_context *ctx = &intel->ctx; 3632 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; 3633 3634 if (!prog) 3635 return GL_FALSE; 3636 3637 struct brw_shader *shader = 3638 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 3639 if (!shader) 3640 return GL_FALSE; 3641 3642 /* We always use 8-wide mode, at least for now. For one, flow 3643 * control only works in 8-wide. Also, when we're fragment shader 3644 * bound, we're almost always under register pressure as well, so 3645 * 8-wide would save us from the performance cliff of spilling 3646 * regs. 3647 */ 3648 c->dispatch_width = 8; 3649 3650 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3651 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 3652 _mesa_print_ir(shader->ir, NULL); 3653 printf("\n"); 3654 } 3655 3656 /* Now the main event: Visit the shader IR and generate our FS IR for it. 3657 */ 3658 fs_visitor v(c, shader); 3659 3660 if (0) { 3661 v.emit_dummy_fs(); 3662 } else { 3663 v.calculate_urb_setup(); 3664 if (intel->gen < 6) 3665 v.emit_interpolation_setup_gen4(); 3666 else 3667 v.emit_interpolation_setup_gen6(); 3668 3669 /* Generate FS IR for main(). (the visitor only descends into 3670 * functions called "main"). 
3671 */ 3672 foreach_iter(exec_list_iterator, iter, *shader->ir) { 3673 ir_instruction *ir = (ir_instruction *)iter.get(); 3674 v.base_ir = ir; 3675 ir->accept(&v); 3676 } 3677 3678 v.emit_fb_writes(); 3679 3680 v.split_virtual_grfs(); 3681 3682 v.setup_paramvalues_refs(); 3683 v.setup_pull_constants(); 3684 3685 bool progress; 3686 do { 3687 progress = false; 3688 3689 progress = v.remove_duplicate_mrf_writes() || progress; 3690 3691 progress = v.propagate_constants() || progress; 3692 progress = v.register_coalesce() || progress; 3693 progress = v.compute_to_mrf() || progress; 3694 progress = v.dead_code_eliminate() || progress; 3695 } while (progress); 3696 3697 v.schedule_instructions(); 3698 3699 v.assign_curb_setup(); 3700 v.assign_urb_setup(); 3701 3702 if (0) { 3703 /* Debug of register spilling: Go spill everything. */ 3704 int virtual_grf_count = v.virtual_grf_next; 3705 for (int i = 1; i < virtual_grf_count; i++) { 3706 v.spill_reg(i); 3707 } 3708 } 3709 3710 if (0) 3711 v.assign_regs_trivial(); 3712 else { 3713 while (!v.assign_regs()) { 3714 if (v.fail) 3715 break; 3716 } 3717 } 3718 } 3719 3720 if (!v.fail) 3721 v.generate_code(); 3722 3723 assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */ 3724 3725 if (v.fail) 3726 return GL_FALSE; 3727 3728 c->prog_data.total_grf = v.grf_used; 3729 3730 return GL_TRUE; 3731} 3732