brw_fs.cpp revision 58f7c9c72ee52527610b26ca8a137dd88c082c89
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28extern "C" { 29 30#include <sys/types.h> 31 32#include "main/macros.h" 33#include "main/shaderobj.h" 34#include "main/uniforms.h" 35#include "program/prog_parameter.h" 36#include "program/prog_print.h" 37#include "program/prog_optimize.h" 38#include "program/register_allocate.h" 39#include "program/sampler.h" 40#include "program/hash_table.h" 41#include "brw_context.h" 42#include "brw_eu.h" 43#include "brw_wm.h" 44} 45#include "brw_fs.h" 46#include "../glsl/glsl_types.h" 47#include "../glsl/ir_optimization.h" 48#include "../glsl/ir_print_visitor.h" 49 50#define MAX_INSTRUCTION (1 << 30) 51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); 52 53struct gl_shader * 54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) 55{ 56 struct brw_shader *shader; 57 58 shader = rzalloc(NULL, struct brw_shader); 59 if (shader) { 60 shader->base.Type = type; 61 shader->base.Name = name; 62 _mesa_init_shader(ctx, &shader->base); 63 } 64 65 return &shader->base; 66} 67 68struct gl_shader_program * 69brw_new_shader_program(struct gl_context *ctx, GLuint name) 70{ 71 struct brw_shader_program *prog; 72 prog = rzalloc(NULL, struct brw_shader_program); 73 if (prog) { 74 prog->base.Name = name; 75 _mesa_init_shader_program(ctx, &prog->base); 76 } 77 return &prog->base; 78} 79 80GLboolean 81brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader) 82{ 83 if (!_mesa_ir_compile_shader(ctx, shader)) 84 return GL_FALSE; 85 86 return GL_TRUE; 87} 88 89GLboolean 90brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) 91{ 92 struct brw_context *brw = brw_context(ctx); 93 struct intel_context *intel = &brw->intel; 94 95 struct brw_shader *shader = 96 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 97 if (shader != NULL) { 98 void *mem_ctx = ralloc_context(NULL); 99 bool progress; 100 101 if (shader->ir) 102 ralloc_free(shader->ir); 103 shader->ir = new(shader) exec_list; 104 clone_ir_list(mem_ctx, shader->ir, shader->base.ir); 105 106 do_mat_op_to_vec(shader->ir); 107 lower_instructions(shader->ir, 108 MOD_TO_FRACT | 109 DIV_TO_MUL_RCP | 110 SUB_TO_ADD_NEG | 111 EXP_TO_EXP2 | 112 LOG_TO_LOG2); 113 114 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, 115 * if-statements need to be flattened. 116 */ 117 if (intel->gen < 6) 118 lower_if_to_cond_assign(shader->ir, 16); 119 120 do_lower_texture_projection(shader->ir); 121 do_vec_index_to_cond_assign(shader->ir); 122 brw_do_cubemap_normalize(shader->ir); 123 124 do { 125 progress = false; 126 127 brw_do_channel_expressions(shader->ir); 128 brw_do_vector_splitting(shader->ir); 129 130 progress = do_lower_jumps(shader->ir, true, true, 131 true, /* main return */ 132 false, /* continue */ 133 false /* loops */ 134 ) || progress; 135 136 progress = do_common_optimization(shader->ir, true, 32) || progress; 137 138 progress = lower_noise(shader->ir) || progress; 139 progress = 140 lower_variable_index_to_cond_assign(shader->ir, 141 GL_TRUE, /* input */ 142 GL_TRUE, /* output */ 143 GL_TRUE, /* temp */ 144 GL_TRUE /* uniform */ 145 ) || progress; 146 progress = lower_quadop_vector(shader->ir, false) || progress; 147 } while (progress); 148 149 validate_ir_tree(shader->ir); 150 151 reparent_ir(shader->ir, shader->ir); 152 ralloc_free(mem_ctx); 153 } 154 155 if (!_mesa_ir_link_shader(ctx, prog)) 156 return GL_FALSE; 157 158 return GL_TRUE; 159} 160 161static int 162type_size(const struct glsl_type *type) 163{ 164 unsigned int size, i; 165 166 switch (type->base_type) { 167 case GLSL_TYPE_UINT: 168 case GLSL_TYPE_INT: 169 case GLSL_TYPE_FLOAT: 170 case GLSL_TYPE_BOOL: 171 return type->components(); 172 case GLSL_TYPE_ARRAY: 173 return type_size(type->fields.array) * type->length; 174 case GLSL_TYPE_STRUCT: 175 size = 0; 176 for (i = 0; i < type->length; i++) { 177 size += type_size(type->fields.structure[i].type); 178 } 179 return size; 180 case GLSL_TYPE_SAMPLER: 181 /* Samplers take up no register space, since they're baked in at 182 * link time. 183 */ 184 return 0; 185 default: 186 assert(!"not reached"); 187 return 0; 188 } 189} 190 191/** 192 * Returns how many MRFs an FS opcode will write over. 193 * 194 * Note that this is not the 0 or 1 implied writes in an actual gen 195 * instruction -- the FS opcodes often generate MOVs in addition. 196 */ 197int 198fs_visitor::implied_mrf_writes(fs_inst *inst) 199{ 200 if (inst->mlen == 0) 201 return 0; 202 203 switch (inst->opcode) { 204 case FS_OPCODE_RCP: 205 case FS_OPCODE_RSQ: 206 case FS_OPCODE_SQRT: 207 case FS_OPCODE_EXP2: 208 case FS_OPCODE_LOG2: 209 case FS_OPCODE_SIN: 210 case FS_OPCODE_COS: 211 return 1; 212 case FS_OPCODE_POW: 213 return 2; 214 case FS_OPCODE_TEX: 215 case FS_OPCODE_TXB: 216 case FS_OPCODE_TXD: 217 case FS_OPCODE_TXL: 218 return 1; 219 case FS_OPCODE_FB_WRITE: 220 return 2; 221 case FS_OPCODE_PULL_CONSTANT_LOAD: 222 case FS_OPCODE_UNSPILL: 223 return 1; 224 case FS_OPCODE_SPILL: 225 return 2; 226 default: 227 assert(!"not reached"); 228 return inst->mlen; 229 } 230} 231 232int 233fs_visitor::virtual_grf_alloc(int size) 234{ 235 if (virtual_grf_array_size <= virtual_grf_next) { 236 if (virtual_grf_array_size == 0) 237 virtual_grf_array_size = 16; 238 else 239 virtual_grf_array_size *= 2; 240 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 241 virtual_grf_array_size); 242 243 /* This slot is always unused. */ 244 virtual_grf_sizes[0] = 0; 245 } 246 virtual_grf_sizes[virtual_grf_next] = size; 247 return virtual_grf_next++; 248} 249 250/** Fixed HW reg constructor. */ 251fs_reg::fs_reg(enum register_file file, int hw_reg) 252{ 253 init(); 254 this->file = file; 255 this->hw_reg = hw_reg; 256 this->type = BRW_REGISTER_TYPE_F; 257} 258 259/** Fixed HW reg constructor. */ 260fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 261{ 262 init(); 263 this->file = file; 264 this->hw_reg = hw_reg; 265 this->type = type; 266} 267 268int 269brw_type_for_base_type(const struct glsl_type *type) 270{ 271 switch (type->base_type) { 272 case GLSL_TYPE_FLOAT: 273 return BRW_REGISTER_TYPE_F; 274 case GLSL_TYPE_INT: 275 case GLSL_TYPE_BOOL: 276 return BRW_REGISTER_TYPE_D; 277 case GLSL_TYPE_UINT: 278 return BRW_REGISTER_TYPE_UD; 279 case GLSL_TYPE_ARRAY: 280 case GLSL_TYPE_STRUCT: 281 case GLSL_TYPE_SAMPLER: 282 /* These should be overridden with the type of the member when 283 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely 284 * way to trip up if we don't. 285 */ 286 return BRW_REGISTER_TYPE_UD; 287 default: 288 assert(!"not reached"); 289 return BRW_REGISTER_TYPE_F; 290 } 291} 292 293/** Automatic reg constructor. */ 294fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 295{ 296 init(); 297 298 this->file = GRF; 299 this->reg = v->virtual_grf_alloc(type_size(type)); 300 this->reg_offset = 0; 301 this->type = brw_type_for_base_type(type); 302} 303 304fs_reg * 305fs_visitor::variable_storage(ir_variable *var) 306{ 307 return (fs_reg *)hash_table_find(this->variable_ht, var); 308} 309 310/* Our support for uniforms is piggy-backed on the struct 311 * gl_fragment_program, because that's where the values actually 312 * get stored, rather than in some global gl_shader_program uniform 313 * store. 314 */ 315int 316fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 317{ 318 unsigned int offset = 0; 319 320 if (type->is_matrix()) { 321 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 322 type->vector_elements, 323 1); 324 325 for (unsigned int i = 0; i < type->matrix_columns; i++) { 326 offset += setup_uniform_values(loc + offset, column); 327 } 328 329 return offset; 330 } 331 332 switch (type->base_type) { 333 case GLSL_TYPE_FLOAT: 334 case GLSL_TYPE_UINT: 335 case GLSL_TYPE_INT: 336 case GLSL_TYPE_BOOL: 337 for (unsigned int i = 0; i < type->vector_elements; i++) { 338 unsigned int param = c->prog_data.nr_params++; 339 340 assert(param < ARRAY_SIZE(c->prog_data.param)); 341 342 switch (type->base_type) { 343 case GLSL_TYPE_FLOAT: 344 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 345 break; 346 case GLSL_TYPE_UINT: 347 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 348 break; 349 case GLSL_TYPE_INT: 350 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 351 break; 352 case GLSL_TYPE_BOOL: 353 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 354 break; 355 default: 356 assert(!"not reached"); 357 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 358 break; 359 } 360 this->param_index[param] = loc; 361 this->param_offset[param] = i; 362 } 363 return 1; 364 365 case GLSL_TYPE_STRUCT: 366 for (unsigned int i = 0; i < type->length; i++) { 367 offset += setup_uniform_values(loc + offset, 368 type->fields.structure[i].type); 369 } 370 return offset; 371 372 case GLSL_TYPE_ARRAY: 373 for (unsigned int i = 0; i < type->length; i++) { 374 offset += setup_uniform_values(loc + offset, type->fields.array); 375 } 376 return offset; 377 378 case GLSL_TYPE_SAMPLER: 379 /* The sampler takes up a slot, but we don't use any values from it. */ 380 return 1; 381 382 default: 383 assert(!"not reached"); 384 return 0; 385 } 386} 387 388 389/* Our support for builtin uniforms is even scarier than non-builtin. 390 * It sits on top of the PROG_STATE_VAR parameters that are 391 * automatically updated from GL context state. 392 */ 393void 394fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 395{ 396 const struct gl_builtin_uniform_desc *statevar = NULL; 397 398 for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) { 399 statevar = &_mesa_builtin_uniform_desc[i]; 400 if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0) 401 break; 402 } 403 404 if (!statevar->name) { 405 this->fail = true; 406 printf("Failed to find builtin uniform `%s'\n", ir->name); 407 return; 408 } 409 410 int array_count; 411 if (ir->type->is_array()) { 412 array_count = ir->type->length; 413 } else { 414 array_count = 1; 415 } 416 417 for (int a = 0; a < array_count; a++) { 418 for (unsigned int i = 0; i < statevar->num_elements; i++) { 419 struct gl_builtin_uniform_element *element = &statevar->elements[i]; 420 int tokens[STATE_LENGTH]; 421 422 memcpy(tokens, element->tokens, sizeof(element->tokens)); 423 if (ir->type->is_array()) { 424 tokens[1] = a; 425 } 426 427 /* This state reference has already been setup by ir_to_mesa, 428 * but we'll get the same index back here. 429 */ 430 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 431 (gl_state_index *)tokens); 432 433 /* Add each of the unique swizzles of the element as a 434 * parameter. This'll end up matching the expected layout of 435 * the array/matrix/structure we're trying to fill in. 436 */ 437 int last_swiz = -1; 438 for (unsigned int i = 0; i < 4; i++) { 439 int swiz = GET_SWZ(element->swizzle, i); 440 if (swiz == last_swiz) 441 break; 442 last_swiz = swiz; 443 444 c->prog_data.param_convert[c->prog_data.nr_params] = 445 PARAM_NO_CONVERT; 446 this->param_index[c->prog_data.nr_params] = index; 447 this->param_offset[c->prog_data.nr_params] = swiz; 448 c->prog_data.nr_params++; 449 } 450 } 451 } 452} 453 454fs_reg * 455fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 456{ 457 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 458 fs_reg wpos = *reg; 459 fs_reg neg_y = this->pixel_y; 460 neg_y.negate = true; 461 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 462 463 /* gl_FragCoord.x */ 464 if (ir->pixel_center_integer) { 465 emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x)); 466 } else { 467 emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f))); 468 } 469 wpos.reg_offset++; 470 471 /* gl_FragCoord.y */ 472 if (!flip && ir->pixel_center_integer) { 473 emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y)); 474 } else { 475 fs_reg pixel_y = this->pixel_y; 476 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 477 478 if (flip) { 479 pixel_y.negate = true; 480 offset += c->key.drawable_height - 1.0; 481 } 482 483 emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset))); 484 } 485 wpos.reg_offset++; 486 487 /* gl_FragCoord.z */ 488 if (intel->gen >= 6) { 489 emit(fs_inst(BRW_OPCODE_MOV, wpos, 490 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)))); 491 } else { 492 emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 493 interp_reg(FRAG_ATTRIB_WPOS, 2))); 494 } 495 wpos.reg_offset++; 496 497 /* gl_FragCoord.w: Already set up in emit_interpolation */ 498 emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_w)); 499 500 return reg; 501} 502 503fs_reg * 504fs_visitor::emit_general_interpolation(ir_variable *ir) 505{ 506 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 507 /* Interpolation is always in floating point regs. */ 508 reg->type = BRW_REGISTER_TYPE_F; 509 fs_reg attr = *reg; 510 511 unsigned int array_elements; 512 const glsl_type *type; 513 514 if (ir->type->is_array()) { 515 array_elements = ir->type->length; 516 if (array_elements == 0) { 517 this->fail = true; 518 } 519 type = ir->type->fields.array; 520 } else { 521 array_elements = 1; 522 type = ir->type; 523 } 524 525 int location = ir->location; 526 for (unsigned int i = 0; i < array_elements; i++) { 527 for (unsigned int j = 0; j < type->matrix_columns; j++) { 528 if (urb_setup[location] == -1) { 529 /* If there's no incoming setup data for this slot, don't 530 * emit interpolation for it. 531 */ 532 attr.reg_offset += type->vector_elements; 533 location++; 534 continue; 535 } 536 537 if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 || 538 location == FRAG_ATTRIB_COL1)) { 539 /* Constant interpolation (flat shading) case. The SF has 540 * handed us defined values in only the constant offset 541 * field of the setup reg. 542 */ 543 for (unsigned int c = 0; c < type->vector_elements; c++) { 544 struct brw_reg interp = interp_reg(location, c); 545 interp = suboffset(interp, 3); 546 emit(fs_inst(FS_OPCODE_CINTERP, attr, fs_reg(interp))); 547 attr.reg_offset++; 548 } 549 } else { 550 /* Perspective interpolation case. */ 551 for (unsigned int c = 0; c < type->vector_elements; c++) { 552 struct brw_reg interp = interp_reg(location, c); 553 emit(fs_inst(FS_OPCODE_LINTERP, 554 attr, 555 this->delta_x, 556 this->delta_y, 557 fs_reg(interp))); 558 attr.reg_offset++; 559 } 560 561 if (intel->gen < 6) { 562 attr.reg_offset -= type->vector_elements; 563 for (unsigned int c = 0; c < type->vector_elements; c++) { 564 emit(fs_inst(BRW_OPCODE_MUL, 565 attr, 566 attr, 567 this->pixel_w)); 568 attr.reg_offset++; 569 } 570 } 571 } 572 location++; 573 } 574 } 575 576 return reg; 577} 578 579fs_reg * 580fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 581{ 582 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 583 584 /* The frontfacing comes in as a bit in the thread payload. */ 585 if (intel->gen >= 6) { 586 emit(fs_inst(BRW_OPCODE_ASR, 587 *reg, 588 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 589 fs_reg(15))); 590 emit(fs_inst(BRW_OPCODE_NOT, 591 *reg, 592 *reg)); 593 emit(fs_inst(BRW_OPCODE_AND, 594 *reg, 595 *reg, 596 fs_reg(1))); 597 } else { 598 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 599 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 600 * us front face 601 */ 602 fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, 603 *reg, 604 fs_reg(r1_6ud), 605 fs_reg(1u << 31))); 606 inst->conditional_mod = BRW_CONDITIONAL_L; 607 emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u))); 608 } 609 610 return reg; 611} 612 613fs_inst * 614fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 615{ 616 switch (opcode) { 617 case FS_OPCODE_RCP: 618 case FS_OPCODE_RSQ: 619 case FS_OPCODE_SQRT: 620 case FS_OPCODE_EXP2: 621 case FS_OPCODE_LOG2: 622 case FS_OPCODE_SIN: 623 case FS_OPCODE_COS: 624 break; 625 default: 626 assert(!"not reached: bad math opcode"); 627 return NULL; 628 } 629 630 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 631 * might be able to do better by doing execsize = 1 math and then 632 * expanding that result out, but we would need to be careful with 633 * masking. 634 * 635 * The hardware ignores source modifiers (negate and abs) on math 636 * instructions, so we also move to a temp to set those up. 637 */ 638 if (intel->gen >= 6 && (src.file == UNIFORM || 639 src.abs || 640 src.negate)) { 641 fs_reg expanded = fs_reg(this, glsl_type::float_type); 642 emit(fs_inst(BRW_OPCODE_MOV, expanded, src)); 643 src = expanded; 644 } 645 646 fs_inst *inst = emit(fs_inst(opcode, dst, src)); 647 648 if (intel->gen < 6) { 649 inst->base_mrf = 2; 650 inst->mlen = 1; 651 } 652 653 return inst; 654} 655 656fs_inst * 657fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 658{ 659 int base_mrf = 2; 660 fs_inst *inst; 661 662 assert(opcode == FS_OPCODE_POW); 663 664 if (intel->gen >= 6) { 665 /* Can't do hstride == 0 args to gen6 math, so expand it out. 666 * 667 * The hardware ignores source modifiers (negate and abs) on math 668 * instructions, so we also move to a temp to set those up. 669 */ 670 if (src0.file == UNIFORM || src0.abs || src0.negate) { 671 fs_reg expanded = fs_reg(this, glsl_type::float_type); 672 emit(fs_inst(BRW_OPCODE_MOV, expanded, src0)); 673 src0 = expanded; 674 } 675 676 if (src1.file == UNIFORM || src1.abs || src1.negate) { 677 fs_reg expanded = fs_reg(this, glsl_type::float_type); 678 emit(fs_inst(BRW_OPCODE_MOV, expanded, src1)); 679 src1 = expanded; 680 } 681 682 inst = emit(fs_inst(opcode, dst, src0, src1)); 683 } else { 684 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1)); 685 inst = emit(fs_inst(opcode, dst, src0, reg_null_f)); 686 687 inst->base_mrf = base_mrf; 688 inst->mlen = 2; 689 } 690 return inst; 691} 692 693void 694fs_visitor::visit(ir_variable *ir) 695{ 696 fs_reg *reg = NULL; 697 698 if (variable_storage(ir)) 699 return; 700 701 if (strcmp(ir->name, "gl_FragColor") == 0) { 702 this->frag_color = ir; 703 } else if (strcmp(ir->name, "gl_FragData") == 0) { 704 this->frag_data = ir; 705 } else if (strcmp(ir->name, "gl_FragDepth") == 0) { 706 this->frag_depth = ir; 707 } 708 709 if (ir->mode == ir_var_in) { 710 if (!strcmp(ir->name, "gl_FragCoord")) { 711 reg = emit_fragcoord_interpolation(ir); 712 } else if (!strcmp(ir->name, "gl_FrontFacing")) { 713 reg = emit_frontfacing_interpolation(ir); 714 } else { 715 reg = emit_general_interpolation(ir); 716 } 717 assert(reg); 718 hash_table_insert(this->variable_ht, reg, ir); 719 return; 720 } 721 722 if (ir->mode == ir_var_uniform) { 723 int param_index = c->prog_data.nr_params; 724 725 if (!strncmp(ir->name, "gl_", 3)) { 726 setup_builtin_uniform_values(ir); 727 } else { 728 setup_uniform_values(ir->location, ir->type); 729 } 730 731 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); 732 reg->type = brw_type_for_base_type(ir->type); 733 } 734 735 if (!reg) 736 reg = new(this->mem_ctx) fs_reg(this, ir->type); 737 738 hash_table_insert(this->variable_ht, reg, ir); 739} 740 741void 742fs_visitor::visit(ir_dereference_variable *ir) 743{ 744 fs_reg *reg = variable_storage(ir->var); 745 this->result = *reg; 746} 747 748void 749fs_visitor::visit(ir_dereference_record *ir) 750{ 751 const glsl_type *struct_type = ir->record->type; 752 753 ir->record->accept(this); 754 755 unsigned int offset = 0; 756 for (unsigned int i = 0; i < struct_type->length; i++) { 757 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) 758 break; 759 offset += type_size(struct_type->fields.structure[i].type); 760 } 761 this->result.reg_offset += offset; 762 this->result.type = brw_type_for_base_type(ir->type); 763} 764 765void 766fs_visitor::visit(ir_dereference_array *ir) 767{ 768 ir_constant *index; 769 int element_size; 770 771 ir->array->accept(this); 772 index = ir->array_index->as_constant(); 773 774 element_size = type_size(ir->type); 775 this->result.type = brw_type_for_base_type(ir->type); 776 777 if (index) { 778 assert(this->result.file == UNIFORM || 779 (this->result.file == GRF && 780 this->result.reg != 0)); 781 this->result.reg_offset += index->value.i[0] * element_size; 782 } else { 783 assert(!"FINISHME: non-constant array element"); 784 } 785} 786 787/* Instruction selection: Produce a MOV.sat instead of 788 * MIN(MAX(val, 0), 1) when possible. 789 */ 790bool 791fs_visitor::try_emit_saturate(ir_expression *ir) 792{ 793 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 794 795 if (!sat_val) 796 return false; 797 798 sat_val->accept(this); 799 fs_reg src = this->result; 800 801 this->result = fs_reg(this, ir->type); 802 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, src)); 803 inst->saturate = true; 804 805 return true; 806} 807 808static uint32_t 809brw_conditional_for_comparison(unsigned int op) 810{ 811 switch (op) { 812 case ir_binop_less: 813 return BRW_CONDITIONAL_L; 814 case ir_binop_greater: 815 return BRW_CONDITIONAL_G; 816 case ir_binop_lequal: 817 return BRW_CONDITIONAL_LE; 818 case ir_binop_gequal: 819 return BRW_CONDITIONAL_GE; 820 case ir_binop_equal: 821 case ir_binop_all_equal: /* same as equal for scalars */ 822 return BRW_CONDITIONAL_Z; 823 case ir_binop_nequal: 824 case ir_binop_any_nequal: /* same as nequal for scalars */ 825 return BRW_CONDITIONAL_NZ; 826 default: 827 assert(!"not reached: bad operation for comparison"); 828 return BRW_CONDITIONAL_NZ; 829 } 830} 831 832void 833fs_visitor::visit(ir_expression *ir) 834{ 835 unsigned int operand; 836 fs_reg op[2], temp; 837 fs_inst *inst; 838 839 assert(ir->get_num_operands() <= 2); 840 841 if (try_emit_saturate(ir)) 842 return; 843 844 for (operand = 0; operand < ir->get_num_operands(); operand++) { 845 ir->operands[operand]->accept(this); 846 if (this->result.file == BAD_FILE) { 847 ir_print_visitor v; 848 printf("Failed to get tree for expression operand:\n"); 849 ir->operands[operand]->accept(&v); 850 this->fail = true; 851 } 852 op[operand] = this->result; 853 854 /* Matrix expression operands should have been broken down to vector 855 * operations already. 856 */ 857 assert(!ir->operands[operand]->type->is_matrix()); 858 /* And then those vector operands should have been broken down to scalar. 859 */ 860 assert(!ir->operands[operand]->type->is_vector()); 861 } 862 863 /* Storage for our result. If our result goes into an assignment, it will 864 * just get copy-propagated out, so no worries. 865 */ 866 this->result = fs_reg(this, ir->type); 867 868 switch (ir->operation) { 869 case ir_unop_logic_not: 870 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 871 * ones complement of the whole register, not just bit 0. 872 */ 873 emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1))); 874 break; 875 case ir_unop_neg: 876 op[0].negate = !op[0].negate; 877 this->result = op[0]; 878 break; 879 case ir_unop_abs: 880 op[0].abs = true; 881 op[0].negate = false; 882 this->result = op[0]; 883 break; 884 case ir_unop_sign: 885 temp = fs_reg(this, ir->type); 886 887 emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f))); 888 889 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f))); 890 inst->conditional_mod = BRW_CONDITIONAL_G; 891 inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f))); 892 inst->predicated = true; 893 894 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f))); 895 inst->conditional_mod = BRW_CONDITIONAL_L; 896 inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f))); 897 inst->predicated = true; 898 899 break; 900 case ir_unop_rcp: 901 emit_math(FS_OPCODE_RCP, this->result, op[0]); 902 break; 903 904 case ir_unop_exp2: 905 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 906 break; 907 case ir_unop_log2: 908 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 909 break; 910 case ir_unop_exp: 911 case ir_unop_log: 912 assert(!"not reached: should be handled by ir_explog_to_explog2"); 913 break; 914 case ir_unop_sin: 915 case ir_unop_sin_reduced: 916 emit_math(FS_OPCODE_SIN, this->result, op[0]); 917 break; 918 case ir_unop_cos: 919 case ir_unop_cos_reduced: 920 emit_math(FS_OPCODE_COS, this->result, op[0]); 921 break; 922 923 case ir_unop_dFdx: 924 emit(fs_inst(FS_OPCODE_DDX, this->result, op[0])); 925 break; 926 case ir_unop_dFdy: 927 emit(fs_inst(FS_OPCODE_DDY, this->result, op[0])); 928 break; 929 930 case ir_binop_add: 931 emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1])); 932 break; 933 case ir_binop_sub: 934 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 935 break; 936 937 case ir_binop_mul: 938 emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1])); 939 break; 940 case ir_binop_div: 941 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 942 break; 943 case ir_binop_mod: 944 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 945 break; 946 947 case ir_binop_less: 948 case ir_binop_greater: 949 case ir_binop_lequal: 950 case ir_binop_gequal: 951 case ir_binop_equal: 952 case ir_binop_all_equal: 953 case ir_binop_nequal: 954 case ir_binop_any_nequal: 955 temp = this->result; 956 /* original gen4 does implicit conversion before comparison. */ 957 if (intel->gen < 5) 958 temp.type = op[0].type; 959 960 inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1])); 961 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 962 emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); 963 break; 964 965 case ir_binop_logic_xor: 966 emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1])); 967 break; 968 969 case ir_binop_logic_or: 970 emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1])); 971 break; 972 973 case ir_binop_logic_and: 974 emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1])); 975 break; 976 977 case ir_binop_dot: 978 case ir_unop_any: 979 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 980 break; 981 982 case ir_unop_noise: 983 assert(!"not reached: should be handled by lower_noise"); 984 break; 985 986 case ir_quadop_vector: 987 assert(!"not reached: should be handled by lower_quadop_vector"); 988 break; 989 990 case ir_unop_sqrt: 991 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 992 break; 993 994 case ir_unop_rsq: 995 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 996 break; 997 998 case ir_unop_i2f: 999 case ir_unop_b2f: 1000 case ir_unop_b2i: 1001 case ir_unop_f2i: 1002 emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0])); 1003 break; 1004 case ir_unop_f2b: 1005 case ir_unop_i2b: 1006 temp = this->result; 1007 /* original gen4 does implicit conversion before comparison. */ 1008 if (intel->gen < 5) 1009 temp.type = op[0].type; 1010 1011 inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f))); 1012 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1013 inst = emit(fs_inst(BRW_OPCODE_AND, this->result, 1014 this->result, fs_reg(1))); 1015 break; 1016 1017 case ir_unop_trunc: 1018 emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0])); 1019 break; 1020 case ir_unop_ceil: 1021 op[0].negate = !op[0].negate; 1022 inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0])); 1023 this->result.negate = true; 1024 break; 1025 case ir_unop_floor: 1026 inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0])); 1027 break; 1028 case ir_unop_fract: 1029 inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0])); 1030 break; 1031 case ir_unop_round_even: 1032 emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0])); 1033 break; 1034 1035 case ir_binop_min: 1036 inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); 1037 inst->conditional_mod = BRW_CONDITIONAL_L; 1038 1039 inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1])); 1040 inst->predicated = true; 1041 break; 1042 case ir_binop_max: 1043 inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); 1044 inst->conditional_mod = BRW_CONDITIONAL_G; 1045 1046 inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1])); 1047 inst->predicated = true; 1048 break; 1049 1050 case ir_binop_pow: 1051 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1052 break; 1053 1054 case ir_unop_bit_not: 1055 inst = emit(fs_inst(BRW_OPCODE_NOT, this->result, op[0])); 1056 break; 1057 case ir_binop_bit_and: 1058 inst = emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1])); 1059 break; 1060 case ir_binop_bit_xor: 1061 inst = emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1])); 1062 break; 1063 case ir_binop_bit_or: 1064 inst = emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1])); 1065 break; 1066 1067 case ir_unop_u2f: 1068 case ir_binop_lshift: 1069 case ir_binop_rshift: 1070 assert(!"GLSL 1.30 features unsupported"); 1071 break; 1072 } 1073} 1074 1075void 1076fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1077 const glsl_type *type, bool predicated) 1078{ 1079 switch (type->base_type) { 1080 case GLSL_TYPE_FLOAT: 1081 case GLSL_TYPE_UINT: 1082 case GLSL_TYPE_INT: 1083 case GLSL_TYPE_BOOL: 1084 for (unsigned int i = 0; i < type->components(); i++) { 1085 l.type = brw_type_for_base_type(type); 1086 r.type = brw_type_for_base_type(type); 1087 1088 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r)); 1089 inst->predicated = predicated; 1090 1091 l.reg_offset++; 1092 r.reg_offset++; 1093 } 1094 break; 1095 case GLSL_TYPE_ARRAY: 1096 for (unsigned int i = 0; i < type->length; i++) { 1097 emit_assignment_writes(l, r, type->fields.array, predicated); 1098 } 1099 break; 1100 1101 case GLSL_TYPE_STRUCT: 1102 for (unsigned int i = 0; i < type->length; i++) { 1103 emit_assignment_writes(l, r, type->fields.structure[i].type, 1104 predicated); 1105 } 1106 break; 1107 1108 case GLSL_TYPE_SAMPLER: 1109 break; 1110 1111 default: 1112 assert(!"not reached"); 1113 break; 1114 } 1115} 1116 1117void 1118fs_visitor::visit(ir_assignment *ir) 1119{ 1120 struct fs_reg l, r; 1121 fs_inst *inst; 1122 1123 /* FINISHME: arrays on the lhs */ 1124 ir->lhs->accept(this); 1125 l = this->result; 1126 1127 ir->rhs->accept(this); 1128 r = this->result; 1129 1130 assert(l.file != BAD_FILE); 1131 assert(r.file != BAD_FILE); 1132 1133 if (ir->condition) { 1134 emit_bool_to_cond_code(ir->condition); 1135 } 1136 1137 if (ir->lhs->type->is_scalar() || 1138 ir->lhs->type->is_vector()) { 1139 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1140 if (ir->write_mask & (1 << i)) { 1141 inst = emit(fs_inst(BRW_OPCODE_MOV, l, r)); 1142 if (ir->condition) 1143 inst->predicated = true; 1144 r.reg_offset++; 1145 } 1146 l.reg_offset++; 1147 } 1148 } else { 1149 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1150 } 1151} 1152 1153fs_inst * 1154fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1155{ 1156 int mlen; 1157 int base_mrf = 1; 1158 bool simd16 = false; 1159 fs_reg orig_dst; 1160 1161 /* g0 header. */ 1162 mlen = 1; 1163 1164 if (ir->shadow_comparitor) { 1165 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1166 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), 1167 coordinate)); 1168 coordinate.reg_offset++; 1169 } 1170 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1171 mlen += 3; 1172 1173 if (ir->op == ir_tex) { 1174 /* There's no plain shadow compare message, so we use shadow 1175 * compare with a bias of 0.0. 1176 */ 1177 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1178 fs_reg(0.0f))); 1179 mlen++; 1180 } else if (ir->op == ir_txb) { 1181 ir->lod_info.bias->accept(this); 1182 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1183 this->result)); 1184 mlen++; 1185 } else { 1186 assert(ir->op == ir_txl); 1187 ir->lod_info.lod->accept(this); 1188 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1189 this->result)); 1190 mlen++; 1191 } 1192 1193 ir->shadow_comparitor->accept(this); 1194 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1195 mlen++; 1196 } else if (ir->op == ir_tex) { 1197 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1198 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), 1199 coordinate)); 1200 coordinate.reg_offset++; 1201 } 1202 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1203 mlen += 3; 1204 } else if (ir->op == ir_txd) { 1205 assert(!"TXD isn't supported on gen4 yet."); 1206 } else { 1207 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1208 * instructions. We'll need to do SIMD16 here. 1209 */ 1210 assert(ir->op == ir_txb || ir->op == ir_txl); 1211 1212 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1213 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), 1214 coordinate)); 1215 coordinate.reg_offset++; 1216 } 1217 1218 /* lod/bias appears after u/v/r. */ 1219 mlen += 6; 1220 1221 if (ir->op == ir_txb) { 1222 ir->lod_info.bias->accept(this); 1223 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1224 this->result)); 1225 mlen++; 1226 } else { 1227 ir->lod_info.lod->accept(this); 1228 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1229 this->result)); 1230 mlen++; 1231 } 1232 1233 /* The unused upper half. */ 1234 mlen++; 1235 1236 /* Now, since we're doing simd16, the return is 2 interleaved 1237 * vec4s where the odd-indexed ones are junk. We'll need to move 1238 * this weirdness around to the expected layout. 1239 */ 1240 simd16 = true; 1241 orig_dst = dst; 1242 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1243 2)); 1244 dst.type = BRW_REGISTER_TYPE_F; 1245 } 1246 1247 fs_inst *inst = NULL; 1248 switch (ir->op) { 1249 case ir_tex: 1250 inst = emit(fs_inst(FS_OPCODE_TEX, dst)); 1251 break; 1252 case ir_txb: 1253 inst = emit(fs_inst(FS_OPCODE_TXB, dst)); 1254 break; 1255 case ir_txl: 1256 inst = emit(fs_inst(FS_OPCODE_TXL, dst)); 1257 break; 1258 case ir_txd: 1259 inst = emit(fs_inst(FS_OPCODE_TXD, dst)); 1260 break; 1261 case ir_txf: 1262 assert(!"GLSL 1.30 features unsupported"); 1263 break; 1264 } 1265 inst->base_mrf = base_mrf; 1266 inst->mlen = mlen; 1267 1268 if (simd16) { 1269 for (int i = 0; i < 4; i++) { 1270 emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst)); 1271 orig_dst.reg_offset++; 1272 dst.reg_offset += 2; 1273 } 1274 } 1275 1276 return inst; 1277} 1278 1279fs_inst * 1280fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1281{ 1282 /* gen5's SIMD8 sampler has slots for u, v, r, array index, then 1283 * optional parameters like shadow comparitor or LOD bias. If 1284 * optional parameters aren't present, those base slots are 1285 * optional and don't need to be included in the message. 1286 * 1287 * We don't fill in the unnecessary slots regardless, which may 1288 * look surprising in the disassembly. 1289 */ 1290 int mlen = 1; /* g0 header always present. */ 1291 int base_mrf = 1; 1292 1293 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1294 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), 1295 coordinate)); 1296 coordinate.reg_offset++; 1297 } 1298 mlen += ir->coordinate->type->vector_elements; 1299 1300 if (ir->shadow_comparitor) { 1301 mlen = MAX2(mlen, 5); 1302 1303 ir->shadow_comparitor->accept(this); 1304 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1305 mlen++; 1306 } 1307 1308 fs_inst *inst = NULL; 1309 switch (ir->op) { 1310 case ir_tex: 1311 inst = emit(fs_inst(FS_OPCODE_TEX, dst)); 1312 break; 1313 case ir_txb: 1314 ir->lod_info.bias->accept(this); 1315 mlen = MAX2(mlen, 5); 1316 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1317 mlen++; 1318 1319 inst = emit(fs_inst(FS_OPCODE_TXB, dst)); 1320 break; 1321 case ir_txl: 1322 ir->lod_info.lod->accept(this); 1323 mlen = MAX2(mlen, 5); 1324 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result)); 1325 mlen++; 1326 1327 inst = emit(fs_inst(FS_OPCODE_TXL, dst)); 1328 break; 1329 case ir_txd: 1330 case ir_txf: 1331 assert(!"GLSL 1.30 features unsupported"); 1332 break; 1333 } 1334 inst->base_mrf = base_mrf; 1335 inst->mlen = mlen; 1336 1337 return inst; 1338} 1339 1340void 1341fs_visitor::visit(ir_texture *ir) 1342{ 1343 int sampler; 1344 fs_inst *inst = NULL; 1345 1346 ir->coordinate->accept(this); 1347 fs_reg coordinate = this->result; 1348 1349 if (ir->offset != NULL) { 1350 ir_constant *offset = ir->offset->as_constant(); 1351 assert(offset != NULL); 1352 1353 signed char offsets[3]; 1354 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) 1355 offsets[i] = (signed char) offset->value.i[i]; 1356 1357 /* Combine all three offsets into a single unsigned dword: 1358 * 1359 * bits 11:8 - U Offset (X component) 1360 * bits 7:4 - V Offset (Y component) 1361 * bits 3:0 - R Offset (Z component) 1362 */ 1363 unsigned offset_bits = 0; 1364 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { 1365 const unsigned shift = 4 * (2 - i); 1366 offset_bits |= (offsets[i] << shift) & (0xF << shift); 1367 } 1368 1369 /* Explicitly set up the message header by copying g0 to msg reg m1. */ 1370 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), 1371 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD))); 1372 1373 /* Then set the offset bits in DWord 2 of the message header. */ 1374 emit(fs_inst(BRW_OPCODE_MOV, 1375 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), 1376 BRW_REGISTER_TYPE_UD)), 1377 fs_reg(brw_imm_uw(offset_bits)))); 1378 } 1379 1380 /* Should be lowered by do_lower_texture_projection */ 1381 assert(!ir->projector); 1382 1383 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1384 ctx->Shader.CurrentFragmentProgram, 1385 &brw->fragment_program->Base); 1386 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1387 1388 /* The 965 requires the EU to do the normalization of GL rectangle 1389 * texture coordinates. We use the program parameter state 1390 * tracking to get the scaling factor. 1391 */ 1392 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1393 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1394 int tokens[STATE_LENGTH] = { 1395 STATE_INTERNAL, 1396 STATE_TEXRECT_SCALE, 1397 sampler, 1398 0, 1399 0 1400 }; 1401 1402 c->prog_data.param_convert[c->prog_data.nr_params] = 1403 PARAM_NO_CONVERT; 1404 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1405 PARAM_NO_CONVERT; 1406 1407 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1408 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1409 GLuint index = _mesa_add_state_reference(params, 1410 (gl_state_index *)tokens); 1411 1412 this->param_index[c->prog_data.nr_params] = index; 1413 this->param_offset[c->prog_data.nr_params] = 0; 1414 c->prog_data.nr_params++; 1415 this->param_index[c->prog_data.nr_params] = index; 1416 this->param_offset[c->prog_data.nr_params] = 1; 1417 c->prog_data.nr_params++; 1418 1419 fs_reg dst = fs_reg(this, ir->coordinate->type); 1420 fs_reg src = coordinate; 1421 coordinate = dst; 1422 1423 emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x)); 1424 dst.reg_offset++; 1425 src.reg_offset++; 1426 emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y)); 1427 } 1428 1429 /* Writemasking doesn't eliminate channels on SIMD8 texture 1430 * samples, so don't worry about them. 1431 */ 1432 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1433 1434 if (intel->gen < 5) { 1435 inst = emit_texture_gen4(ir, dst, coordinate); 1436 } else { 1437 inst = emit_texture_gen5(ir, dst, coordinate); 1438 } 1439 1440 /* If there's an offset, we already set up m1. To avoid the implied move, 1441 * use the null register. Otherwise, we want an implied move from g0. 1442 */ 1443 if (ir->offset != NULL) 1444 inst->src[0] = fs_reg(brw_null_reg()); 1445 else 1446 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); 1447 1448 inst->sampler = sampler; 1449 1450 this->result = dst; 1451 1452 if (ir->shadow_comparitor) 1453 inst->shadow_compare = true; 1454 1455 if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1456 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1457 1458 for (int i = 0; i < 4; i++) { 1459 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1460 fs_reg l = swizzle_dst; 1461 l.reg_offset += i; 1462 1463 if (swiz == SWIZZLE_ZERO) { 1464 emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f))); 1465 } else if (swiz == SWIZZLE_ONE) { 1466 emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f))); 1467 } else { 1468 fs_reg r = dst; 1469 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1470 emit(fs_inst(BRW_OPCODE_MOV, l, r)); 1471 } 1472 } 1473 this->result = swizzle_dst; 1474 } 1475} 1476 1477void 1478fs_visitor::visit(ir_swizzle *ir) 1479{ 1480 ir->val->accept(this); 1481 fs_reg val = this->result; 1482 1483 if (ir->type->vector_elements == 1) { 1484 this->result.reg_offset += ir->mask.x; 1485 return; 1486 } 1487 1488 fs_reg result = fs_reg(this, ir->type); 1489 this->result = result; 1490 1491 for (unsigned int i = 0; i < ir->type->vector_elements; i++) { 1492 fs_reg channel = val; 1493 int swiz = 0; 1494 1495 switch (i) { 1496 case 0: 1497 swiz = ir->mask.x; 1498 break; 1499 case 1: 1500 swiz = ir->mask.y; 1501 break; 1502 case 2: 1503 swiz = ir->mask.z; 1504 break; 1505 case 3: 1506 swiz = ir->mask.w; 1507 break; 1508 } 1509 1510 channel.reg_offset += swiz; 1511 emit(fs_inst(BRW_OPCODE_MOV, result, channel)); 1512 result.reg_offset++; 1513 } 1514} 1515 1516void 1517fs_visitor::visit(ir_discard *ir) 1518{ 1519 fs_reg temp = fs_reg(this, glsl_type::uint_type); 1520 1521 assert(ir->condition == NULL); /* FINISHME */ 1522 1523 emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d)); 1524 emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp)); 1525 kill_emitted = true; 1526} 1527 1528void 1529fs_visitor::visit(ir_constant *ir) 1530{ 1531 /* Set this->result to reg at the bottom of the function because some code 1532 * paths will cause this visitor to be applied to other fields. This will 1533 * cause the value stored in this->result to be modified. 1534 * 1535 * Make reg constant so that it doesn't get accidentally modified along the 1536 * way. Yes, I actually had this problem. :( 1537 */ 1538 const fs_reg reg(this, ir->type); 1539 fs_reg dst_reg = reg; 1540 1541 if (ir->type->is_array()) { 1542 const unsigned size = type_size(ir->type->fields.array); 1543 1544 for (unsigned i = 0; i < ir->type->length; i++) { 1545 ir->array_elements[i]->accept(this); 1546 fs_reg src_reg = this->result; 1547 1548 dst_reg.type = src_reg.type; 1549 for (unsigned j = 0; j < size; j++) { 1550 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg)); 1551 src_reg.reg_offset++; 1552 dst_reg.reg_offset++; 1553 } 1554 } 1555 } else if (ir->type->is_record()) { 1556 foreach_list(node, &ir->components) { 1557 ir_instruction *const field = (ir_instruction *) node; 1558 const unsigned size = type_size(field->type); 1559 1560 field->accept(this); 1561 fs_reg src_reg = this->result; 1562 1563 dst_reg.type = src_reg.type; 1564 for (unsigned j = 0; j < size; j++) { 1565 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg)); 1566 src_reg.reg_offset++; 1567 dst_reg.reg_offset++; 1568 } 1569 } 1570 } else { 1571 const unsigned size = type_size(ir->type); 1572 1573 for (unsigned i = 0; i < size; i++) { 1574 switch (ir->type->base_type) { 1575 case GLSL_TYPE_FLOAT: 1576 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]))); 1577 break; 1578 case GLSL_TYPE_UINT: 1579 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]))); 1580 break; 1581 case GLSL_TYPE_INT: 1582 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]))); 1583 break; 1584 case GLSL_TYPE_BOOL: 1585 emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]))); 1586 break; 1587 default: 1588 assert(!"Non-float/uint/int/bool constant"); 1589 } 1590 dst_reg.reg_offset++; 1591 } 1592 } 1593 1594 this->result = reg; 1595} 1596 1597void 1598fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1599{ 1600 ir_expression *expr = ir->as_expression(); 1601 1602 if (expr) { 1603 fs_reg op[2]; 1604 fs_inst *inst; 1605 1606 assert(expr->get_num_operands() <= 2); 1607 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1608 assert(expr->operands[i]->type->is_scalar()); 1609 1610 expr->operands[i]->accept(this); 1611 op[i] = this->result; 1612 } 1613 1614 switch (expr->operation) { 1615 case ir_unop_logic_not: 1616 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1))); 1617 inst->conditional_mod = BRW_CONDITIONAL_Z; 1618 break; 1619 1620 case ir_binop_logic_xor: 1621 inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1])); 1622 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1623 break; 1624 1625 case ir_binop_logic_or: 1626 inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1])); 1627 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1628 break; 1629 1630 case ir_binop_logic_and: 1631 inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1])); 1632 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1633 break; 1634 1635 case ir_unop_f2b: 1636 if (intel->gen >= 6) { 1637 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, 1638 op[0], fs_reg(0.0f))); 1639 } else { 1640 inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0])); 1641 } 1642 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1643 break; 1644 1645 case ir_unop_i2b: 1646 if (intel->gen >= 6) { 1647 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0))); 1648 } else { 1649 inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0])); 1650 } 1651 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1652 break; 1653 1654 case ir_binop_greater: 1655 case ir_binop_gequal: 1656 case ir_binop_less: 1657 case ir_binop_lequal: 1658 case ir_binop_equal: 1659 case ir_binop_all_equal: 1660 case ir_binop_nequal: 1661 case ir_binop_any_nequal: 1662 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1])); 1663 inst->conditional_mod = 1664 brw_conditional_for_comparison(expr->operation); 1665 break; 1666 1667 default: 1668 assert(!"not reached"); 1669 this->fail = true; 1670 break; 1671 } 1672 return; 1673 } 1674 1675 ir->accept(this); 1676 1677 if (intel->gen >= 6) { 1678 fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, 1679 this->result, fs_reg(1))); 1680 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1681 } else { 1682 fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result)); 1683 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1684 } 1685} 1686 1687/** 1688 * Emit a gen6 IF statement with the comparison folded into the IF 1689 * instruction. 1690 */ 1691void 1692fs_visitor::emit_if_gen6(ir_if *ir) 1693{ 1694 ir_expression *expr = ir->condition->as_expression(); 1695 1696 if (expr) { 1697 fs_reg op[2]; 1698 fs_inst *inst; 1699 fs_reg temp; 1700 1701 assert(expr->get_num_operands() <= 2); 1702 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1703 assert(expr->operands[i]->type->is_scalar()); 1704 1705 expr->operands[i]->accept(this); 1706 op[i] = this->result; 1707 } 1708 1709 switch (expr->operation) { 1710 case ir_unop_logic_not: 1711 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(0))); 1712 inst->conditional_mod = BRW_CONDITIONAL_Z; 1713 return; 1714 1715 case ir_binop_logic_xor: 1716 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); 1717 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1718 return; 1719 1720 case ir_binop_logic_or: 1721 temp = fs_reg(this, glsl_type::bool_type); 1722 emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1])); 1723 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0))); 1724 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1725 return; 1726 1727 case ir_binop_logic_and: 1728 temp = fs_reg(this, glsl_type::bool_type); 1729 emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1])); 1730 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0))); 1731 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1732 return; 1733 1734 case ir_unop_f2b: 1735 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0))); 1736 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1737 return; 1738 1739 case ir_unop_i2b: 1740 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0))); 1741 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1742 return; 1743 1744 case ir_binop_greater: 1745 case ir_binop_gequal: 1746 case ir_binop_less: 1747 case ir_binop_lequal: 1748 case ir_binop_equal: 1749 case ir_binop_all_equal: 1750 case ir_binop_nequal: 1751 case ir_binop_any_nequal: 1752 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); 1753 inst->conditional_mod = 1754 brw_conditional_for_comparison(expr->operation); 1755 return; 1756 default: 1757 assert(!"not reached"); 1758 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0))); 1759 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1760 this->fail = true; 1761 return; 1762 } 1763 return; 1764 } 1765 1766 ir->condition->accept(this); 1767 1768 fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0))); 1769 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1770} 1771 1772void 1773fs_visitor::visit(ir_if *ir) 1774{ 1775 fs_inst *inst; 1776 1777 /* Don't point the annotation at the if statement, because then it plus 1778 * the then and else blocks get printed. 1779 */ 1780 this->base_ir = ir->condition; 1781 1782 if (intel->gen >= 6) { 1783 emit_if_gen6(ir); 1784 } else { 1785 emit_bool_to_cond_code(ir->condition); 1786 1787 inst = emit(fs_inst(BRW_OPCODE_IF)); 1788 inst->predicated = true; 1789 } 1790 1791 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1792 ir_instruction *ir = (ir_instruction *)iter.get(); 1793 this->base_ir = ir; 1794 1795 ir->accept(this); 1796 } 1797 1798 if (!ir->else_instructions.is_empty()) { 1799 emit(fs_inst(BRW_OPCODE_ELSE)); 1800 1801 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1802 ir_instruction *ir = (ir_instruction *)iter.get(); 1803 this->base_ir = ir; 1804 1805 ir->accept(this); 1806 } 1807 } 1808 1809 emit(fs_inst(BRW_OPCODE_ENDIF)); 1810} 1811 1812void 1813fs_visitor::visit(ir_loop *ir) 1814{ 1815 fs_reg counter = reg_undef; 1816 1817 if (ir->counter) { 1818 this->base_ir = ir->counter; 1819 ir->counter->accept(this); 1820 counter = *(variable_storage(ir->counter)); 1821 1822 if (ir->from) { 1823 this->base_ir = ir->from; 1824 ir->from->accept(this); 1825 1826 emit(fs_inst(BRW_OPCODE_MOV, counter, this->result)); 1827 } 1828 } 1829 1830 emit(fs_inst(BRW_OPCODE_DO)); 1831 1832 if (ir->to) { 1833 this->base_ir = ir->to; 1834 ir->to->accept(this); 1835 1836 fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, 1837 counter, this->result)); 1838 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); 1839 1840 inst = emit(fs_inst(BRW_OPCODE_BREAK)); 1841 inst->predicated = true; 1842 } 1843 1844 foreach_iter(exec_list_iterator, iter, ir->body_instructions) { 1845 ir_instruction *ir = (ir_instruction *)iter.get(); 1846 1847 this->base_ir = ir; 1848 ir->accept(this); 1849 } 1850 1851 if (ir->increment) { 1852 this->base_ir = ir->increment; 1853 ir->increment->accept(this); 1854 emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result)); 1855 } 1856 1857 emit(fs_inst(BRW_OPCODE_WHILE)); 1858} 1859 1860void 1861fs_visitor::visit(ir_loop_jump *ir) 1862{ 1863 switch (ir->mode) { 1864 case ir_loop_jump::jump_break: 1865 emit(fs_inst(BRW_OPCODE_BREAK)); 1866 break; 1867 case ir_loop_jump::jump_continue: 1868 emit(fs_inst(BRW_OPCODE_CONTINUE)); 1869 break; 1870 } 1871} 1872 1873void 1874fs_visitor::visit(ir_call *ir) 1875{ 1876 assert(!"FINISHME"); 1877} 1878 1879void 1880fs_visitor::visit(ir_return *ir) 1881{ 1882 assert(!"FINISHME"); 1883} 1884 1885void 1886fs_visitor::visit(ir_function *ir) 1887{ 1888 /* Ignore function bodies other than main() -- we shouldn't see calls to 1889 * them since they should all be inlined before we get to ir_to_mesa. 1890 */ 1891 if (strcmp(ir->name, "main") == 0) { 1892 const ir_function_signature *sig; 1893 exec_list empty; 1894 1895 sig = ir->matching_signature(&empty); 1896 1897 assert(sig); 1898 1899 foreach_iter(exec_list_iterator, iter, sig->body) { 1900 ir_instruction *ir = (ir_instruction *)iter.get(); 1901 this->base_ir = ir; 1902 1903 ir->accept(this); 1904 } 1905 } 1906} 1907 1908void 1909fs_visitor::visit(ir_function_signature *ir) 1910{ 1911 assert(!"not reached"); 1912 (void)ir; 1913} 1914 1915fs_inst * 1916fs_visitor::emit(fs_inst inst) 1917{ 1918 fs_inst *list_inst = new(mem_ctx) fs_inst; 1919 *list_inst = inst; 1920 1921 list_inst->annotation = this->current_annotation; 1922 list_inst->ir = this->base_ir; 1923 1924 this->instructions.push_tail(list_inst); 1925 1926 return list_inst; 1927} 1928 1929/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 1930void 1931fs_visitor::emit_dummy_fs() 1932{ 1933 /* Everyone's favorite color. */ 1934 emit(fs_inst(BRW_OPCODE_MOV, 1935 fs_reg(MRF, 2), 1936 fs_reg(1.0f))); 1937 emit(fs_inst(BRW_OPCODE_MOV, 1938 fs_reg(MRF, 3), 1939 fs_reg(0.0f))); 1940 emit(fs_inst(BRW_OPCODE_MOV, 1941 fs_reg(MRF, 4), 1942 fs_reg(1.0f))); 1943 emit(fs_inst(BRW_OPCODE_MOV, 1944 fs_reg(MRF, 5), 1945 fs_reg(0.0f))); 1946 1947 fs_inst *write; 1948 write = emit(fs_inst(FS_OPCODE_FB_WRITE, 1949 fs_reg(0), 1950 fs_reg(0))); 1951 write->base_mrf = 0; 1952} 1953 1954/* The register location here is relative to the start of the URB 1955 * data. It will get adjusted to be a real location before 1956 * generate_code() time. 1957 */ 1958struct brw_reg 1959fs_visitor::interp_reg(int location, int channel) 1960{ 1961 int regnr = urb_setup[location] * 2 + channel / 2; 1962 int stride = (channel & 1) * 4; 1963 1964 assert(urb_setup[location] != -1); 1965 1966 return brw_vec1_grf(regnr, stride); 1967} 1968 1969/** Emits the interpolation for the varying inputs. */ 1970void 1971fs_visitor::emit_interpolation_setup_gen4() 1972{ 1973 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 1974 1975 this->current_annotation = "compute pixel centers"; 1976 this->pixel_x = fs_reg(this, glsl_type::uint_type); 1977 this->pixel_y = fs_reg(this, glsl_type::uint_type); 1978 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 1979 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 1980 emit(fs_inst(BRW_OPCODE_ADD, 1981 this->pixel_x, 1982 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 1983 fs_reg(brw_imm_v(0x10101010)))); 1984 emit(fs_inst(BRW_OPCODE_ADD, 1985 this->pixel_y, 1986 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 1987 fs_reg(brw_imm_v(0x11001100)))); 1988 1989 this->current_annotation = "compute pixel deltas from v0"; 1990 if (brw->has_pln) { 1991 this->delta_x = fs_reg(this, glsl_type::vec2_type); 1992 this->delta_y = this->delta_x; 1993 this->delta_y.reg_offset++; 1994 } else { 1995 this->delta_x = fs_reg(this, glsl_type::float_type); 1996 this->delta_y = fs_reg(this, glsl_type::float_type); 1997 } 1998 emit(fs_inst(BRW_OPCODE_ADD, 1999 this->delta_x, 2000 this->pixel_x, 2001 fs_reg(negate(brw_vec1_grf(1, 0))))); 2002 emit(fs_inst(BRW_OPCODE_ADD, 2003 this->delta_y, 2004 this->pixel_y, 2005 fs_reg(negate(brw_vec1_grf(1, 1))))); 2006 2007 this->current_annotation = "compute pos.w and 1/pos.w"; 2008 /* Compute wpos.w. It's always in our setup, since it's needed to 2009 * interpolate the other attributes. 2010 */ 2011 this->wpos_w = fs_reg(this, glsl_type::float_type); 2012 emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y, 2013 interp_reg(FRAG_ATTRIB_WPOS, 3))); 2014 /* Compute the pixel 1/W value from wpos.w. */ 2015 this->pixel_w = fs_reg(this, glsl_type::float_type); 2016 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 2017 this->current_annotation = NULL; 2018} 2019 2020/** Emits the interpolation for the varying inputs. */ 2021void 2022fs_visitor::emit_interpolation_setup_gen6() 2023{ 2024 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 2025 2026 /* If the pixel centers end up used, the setup is the same as for gen4. */ 2027 this->current_annotation = "compute pixel centers"; 2028 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); 2029 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); 2030 int_pixel_x.type = BRW_REGISTER_TYPE_UW; 2031 int_pixel_y.type = BRW_REGISTER_TYPE_UW; 2032 emit(fs_inst(BRW_OPCODE_ADD, 2033 int_pixel_x, 2034 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 2035 fs_reg(brw_imm_v(0x10101010)))); 2036 emit(fs_inst(BRW_OPCODE_ADD, 2037 int_pixel_y, 2038 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 2039 fs_reg(brw_imm_v(0x11001100)))); 2040 2041 /* As of gen6, we can no longer mix float and int sources. We have 2042 * to turn the integer pixel centers into floats for their actual 2043 * use. 2044 */ 2045 this->pixel_x = fs_reg(this, glsl_type::float_type); 2046 this->pixel_y = fs_reg(this, glsl_type::float_type); 2047 emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x)); 2048 emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y)); 2049 2050 this->current_annotation = "compute 1/pos.w"; 2051 this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); 2052 this->pixel_w = fs_reg(this, glsl_type::float_type); 2053 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 2054 2055 this->delta_x = fs_reg(brw_vec8_grf(2, 0)); 2056 this->delta_y = fs_reg(brw_vec8_grf(3, 0)); 2057 2058 this->current_annotation = NULL; 2059} 2060 2061void 2062fs_visitor::emit_fb_writes() 2063{ 2064 this->current_annotation = "FB write header"; 2065 GLboolean header_present = GL_TRUE; 2066 int nr = 0; 2067 2068 if (intel->gen >= 6 && 2069 !this->kill_emitted && 2070 c->key.nr_color_regions == 1) { 2071 header_present = false; 2072 } 2073 2074 if (header_present) { 2075 /* m0, m1 header */ 2076 nr += 2; 2077 } 2078 2079 if (c->aa_dest_stencil_reg) { 2080 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2081 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)))); 2082 } 2083 2084 /* Reserve space for color. It'll be filled in per MRT below. */ 2085 int color_mrf = nr; 2086 nr += 4; 2087 2088 if (c->source_depth_to_render_target) { 2089 if (c->computes_depth) { 2090 /* Hand over gl_FragDepth. */ 2091 assert(this->frag_depth); 2092 fs_reg depth = *(variable_storage(this->frag_depth)); 2093 2094 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth)); 2095 } else { 2096 /* Pass through the payload depth. */ 2097 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2098 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)))); 2099 } 2100 } 2101 2102 if (c->dest_depth_reg) { 2103 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2104 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)))); 2105 } 2106 2107 fs_reg color = reg_undef; 2108 if (this->frag_color) 2109 color = *(variable_storage(this->frag_color)); 2110 else if (this->frag_data) { 2111 color = *(variable_storage(this->frag_data)); 2112 color.type = BRW_REGISTER_TYPE_F; 2113 } 2114 2115 for (int target = 0; target < c->key.nr_color_regions; target++) { 2116 this->current_annotation = ralloc_asprintf(this->mem_ctx, 2117 "FB write target %d", 2118 target); 2119 if (this->frag_color || this->frag_data) { 2120 for (int i = 0; i < 4; i++) { 2121 emit(fs_inst(BRW_OPCODE_MOV, 2122 fs_reg(MRF, color_mrf + i), 2123 color)); 2124 color.reg_offset++; 2125 } 2126 } 2127 2128 if (this->frag_color) 2129 color.reg_offset -= 4; 2130 2131 fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE, 2132 reg_undef, reg_undef)); 2133 inst->target = target; 2134 inst->base_mrf = 0; 2135 inst->mlen = nr; 2136 if (target == c->key.nr_color_regions - 1) 2137 inst->eot = true; 2138 inst->header_present = header_present; 2139 } 2140 2141 if (c->key.nr_color_regions == 0) { 2142 fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE, 2143 reg_undef, reg_undef)); 2144 inst->base_mrf = 0; 2145 inst->mlen = nr; 2146 inst->eot = true; 2147 inst->header_present = header_present; 2148 } 2149 2150 this->current_annotation = NULL; 2151} 2152 2153void 2154fs_visitor::generate_fb_write(fs_inst *inst) 2155{ 2156 GLboolean eot = inst->eot; 2157 struct brw_reg implied_header; 2158 2159 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied 2160 * move, here's g1. 2161 */ 2162 brw_push_insn_state(p); 2163 brw_set_mask_control(p, BRW_MASK_DISABLE); 2164 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2165 2166 if (inst->header_present) { 2167 if (intel->gen >= 6) { 2168 brw_MOV(p, 2169 brw_message_reg(inst->base_mrf), 2170 brw_vec8_grf(0, 0)); 2171 2172 if (inst->target > 0) { 2173 /* Set the render target index for choosing BLEND_STATE. */ 2174 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2175 BRW_REGISTER_TYPE_UD), 2176 brw_imm_ud(inst->target)); 2177 } 2178 2179 /* Clear viewport index, render target array index. */ 2180 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0), 2181 BRW_REGISTER_TYPE_UD), 2182 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2183 brw_imm_ud(0xf7ff)); 2184 2185 implied_header = brw_null_reg(); 2186 } else { 2187 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2188 } 2189 2190 brw_MOV(p, 2191 brw_message_reg(inst->base_mrf + 1), 2192 brw_vec8_grf(1, 0)); 2193 } else { 2194 implied_header = brw_null_reg(); 2195 } 2196 2197 brw_pop_insn_state(p); 2198 2199 brw_fb_WRITE(p, 2200 8, /* dispatch_width */ 2201 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), 2202 inst->base_mrf, 2203 implied_header, 2204 inst->target, 2205 inst->mlen, 2206 0, 2207 eot, 2208 inst->header_present); 2209} 2210 2211void 2212fs_visitor::generate_linterp(fs_inst *inst, 2213 struct brw_reg dst, struct brw_reg *src) 2214{ 2215 struct brw_reg delta_x = src[0]; 2216 struct brw_reg delta_y = src[1]; 2217 struct brw_reg interp = src[2]; 2218 2219 if (brw->has_pln && 2220 delta_y.nr == delta_x.nr + 1 && 2221 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2222 brw_PLN(p, dst, interp, delta_x); 2223 } else { 2224 brw_LINE(p, brw_null_reg(), interp, delta_x); 2225 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2226 } 2227} 2228 2229void 2230fs_visitor::generate_math(fs_inst *inst, 2231 struct brw_reg dst, struct brw_reg *src) 2232{ 2233 int op; 2234 2235 switch (inst->opcode) { 2236 case FS_OPCODE_RCP: 2237 op = BRW_MATH_FUNCTION_INV; 2238 break; 2239 case FS_OPCODE_RSQ: 2240 op = BRW_MATH_FUNCTION_RSQ; 2241 break; 2242 case FS_OPCODE_SQRT: 2243 op = BRW_MATH_FUNCTION_SQRT; 2244 break; 2245 case FS_OPCODE_EXP2: 2246 op = BRW_MATH_FUNCTION_EXP; 2247 break; 2248 case FS_OPCODE_LOG2: 2249 op = BRW_MATH_FUNCTION_LOG; 2250 break; 2251 case FS_OPCODE_POW: 2252 op = BRW_MATH_FUNCTION_POW; 2253 break; 2254 case FS_OPCODE_SIN: 2255 op = BRW_MATH_FUNCTION_SIN; 2256 break; 2257 case FS_OPCODE_COS: 2258 op = BRW_MATH_FUNCTION_COS; 2259 break; 2260 default: 2261 assert(!"not reached: unknown math function"); 2262 op = 0; 2263 break; 2264 } 2265 2266 if (intel->gen >= 6) { 2267 assert(inst->mlen == 0); 2268 2269 if (inst->opcode == FS_OPCODE_POW) { 2270 brw_math2(p, dst, op, src[0], src[1]); 2271 } else { 2272 brw_math(p, dst, 2273 op, 2274 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2275 BRW_MATH_SATURATE_NONE, 2276 0, src[0], 2277 BRW_MATH_DATA_VECTOR, 2278 BRW_MATH_PRECISION_FULL); 2279 } 2280 } else { 2281 assert(inst->mlen >= 1); 2282 2283 brw_math(p, dst, 2284 op, 2285 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2286 BRW_MATH_SATURATE_NONE, 2287 inst->base_mrf, src[0], 2288 BRW_MATH_DATA_VECTOR, 2289 BRW_MATH_PRECISION_FULL); 2290 } 2291} 2292 2293void 2294fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2295{ 2296 int msg_type = -1; 2297 int rlen = 4; 2298 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2299 2300 if (intel->gen >= 5) { 2301 switch (inst->opcode) { 2302 case FS_OPCODE_TEX: 2303 if (inst->shadow_compare) { 2304 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5; 2305 } else { 2306 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5; 2307 } 2308 break; 2309 case FS_OPCODE_TXB: 2310 if (inst->shadow_compare) { 2311 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5; 2312 } else { 2313 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5; 2314 } 2315 break; 2316 case FS_OPCODE_TXL: 2317 if (inst->shadow_compare) { 2318 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE_GEN5; 2319 } else { 2320 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_GEN5; 2321 } 2322 break; 2323 case FS_OPCODE_TXD: 2324 assert(!"TXD isn't supported on gen5+ yet."); 2325 break; 2326 } 2327 } else { 2328 switch (inst->opcode) { 2329 case FS_OPCODE_TEX: 2330 /* Note that G45 and older determines shadow compare and dispatch width 2331 * from message length for most messages. 2332 */ 2333 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2334 if (inst->shadow_compare) { 2335 assert(inst->mlen == 6); 2336 } else { 2337 assert(inst->mlen <= 4); 2338 } 2339 break; 2340 case FS_OPCODE_TXB: 2341 if (inst->shadow_compare) { 2342 assert(inst->mlen == 6); 2343 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 2344 } else { 2345 assert(inst->mlen == 9); 2346 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2347 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2348 } 2349 break; 2350 case FS_OPCODE_TXL: 2351 if (inst->shadow_compare) { 2352 assert(inst->mlen == 6); 2353 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 2354 } else { 2355 assert(inst->mlen == 9); 2356 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 2357 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2358 } 2359 break; 2360 case FS_OPCODE_TXD: 2361 assert(!"TXD isn't supported on gen4 yet."); 2362 break; 2363 } 2364 } 2365 assert(msg_type != -1); 2366 2367 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 2368 rlen = 8; 2369 dst = vec16(dst); 2370 } 2371 2372 brw_SAMPLE(p, 2373 retype(dst, BRW_REGISTER_TYPE_UW), 2374 inst->base_mrf, 2375 src, 2376 SURF_INDEX_TEXTURE(inst->sampler), 2377 inst->sampler, 2378 WRITEMASK_XYZW, 2379 msg_type, 2380 rlen, 2381 inst->mlen, 2382 0, 2383 1, 2384 simd_mode); 2385} 2386 2387 2388/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2389 * looking like: 2390 * 2391 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2392 * 2393 * and we're trying to produce: 2394 * 2395 * DDX DDY 2396 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2397 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2398 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2399 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2400 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2401 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2402 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2403 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2404 * 2405 * and add another set of two more subspans if in 16-pixel dispatch mode. 2406 * 2407 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2408 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2409 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2410 * between each other. We could probably do it like ddx and swizzle the right 2411 * order later, but bail for now and just produce 2412 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2413 */ 2414void 2415fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2416{ 2417 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2418 BRW_REGISTER_TYPE_F, 2419 BRW_VERTICAL_STRIDE_2, 2420 BRW_WIDTH_2, 2421 BRW_HORIZONTAL_STRIDE_0, 2422 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2423 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2424 BRW_REGISTER_TYPE_F, 2425 BRW_VERTICAL_STRIDE_2, 2426 BRW_WIDTH_2, 2427 BRW_HORIZONTAL_STRIDE_0, 2428 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2429 brw_ADD(p, dst, src0, negate(src1)); 2430} 2431 2432void 2433fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2434{ 2435 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2436 BRW_REGISTER_TYPE_F, 2437 BRW_VERTICAL_STRIDE_4, 2438 BRW_WIDTH_4, 2439 BRW_HORIZONTAL_STRIDE_0, 2440 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2441 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2442 BRW_REGISTER_TYPE_F, 2443 BRW_VERTICAL_STRIDE_4, 2444 BRW_WIDTH_4, 2445 BRW_HORIZONTAL_STRIDE_0, 2446 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2447 brw_ADD(p, dst, src0, negate(src1)); 2448} 2449 2450void 2451fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask) 2452{ 2453 if (intel->gen >= 6) { 2454 /* Gen6 no longer has the mask reg for us to just read the 2455 * active channels from. However, cmp updates just the channels 2456 * of the flag reg that are enabled, so we can get at the 2457 * channel enables that way. In this step, make a reg of ones 2458 * we'll compare to. 2459 */ 2460 brw_MOV(p, mask, brw_imm_ud(1)); 2461 } else { 2462 brw_push_insn_state(p); 2463 brw_set_mask_control(p, BRW_MASK_DISABLE); 2464 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */ 2465 brw_pop_insn_state(p); 2466 } 2467} 2468 2469void 2470fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) 2471{ 2472 if (intel->gen >= 6) { 2473 struct brw_reg f0 = brw_flag_reg(); 2474 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2475 2476 brw_push_insn_state(p); 2477 brw_set_mask_control(p, BRW_MASK_DISABLE); 2478 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */ 2479 brw_pop_insn_state(p); 2480 2481 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2482 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */ 2483 /* Undo CMP's whacking of predication*/ 2484 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2485 2486 brw_push_insn_state(p); 2487 brw_set_mask_control(p, BRW_MASK_DISABLE); 2488 brw_AND(p, g1, f0, g1); 2489 brw_pop_insn_state(p); 2490 } else { 2491 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2492 2493 mask = brw_uw1_reg(mask.file, mask.nr, 0); 2494 2495 brw_push_insn_state(p); 2496 brw_set_mask_control(p, BRW_MASK_DISABLE); 2497 brw_AND(p, g0, mask, g0); 2498 brw_pop_insn_state(p); 2499 } 2500} 2501 2502void 2503fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2504{ 2505 assert(inst->mlen != 0); 2506 2507 brw_MOV(p, 2508 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2509 retype(src, BRW_REGISTER_TYPE_UD)); 2510 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2511 inst->offset); 2512} 2513 2514void 2515fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2516{ 2517 assert(inst->mlen != 0); 2518 2519 /* Clear any post destination dependencies that would be ignored by 2520 * the block read. See the B-Spec for pre-gen5 send instruction. 2521 * 2522 * This could use a better solution, since texture sampling and 2523 * math reads could potentially run into it as well -- anywhere 2524 * that we have a SEND with a destination that is a register that 2525 * was written but not read within the last N instructions (what's 2526 * N? unsure). This is rare because of dead code elimination, but 2527 * not impossible. 2528 */ 2529 if (intel->gen == 4 && !intel->is_g4x) 2530 brw_MOV(p, brw_null_reg(), dst); 2531 2532 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2533 inst->offset); 2534 2535 if (intel->gen == 4 && !intel->is_g4x) { 2536 /* gen4 errata: destination from a send can't be used as a 2537 * destination until it's been read. Just read it so we don't 2538 * have to worry. 2539 */ 2540 brw_MOV(p, brw_null_reg(), dst); 2541 } 2542} 2543 2544 2545void 2546fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2547{ 2548 assert(inst->mlen != 0); 2549 2550 /* Clear any post destination dependencies that would be ignored by 2551 * the block read. See the B-Spec for pre-gen5 send instruction. 2552 * 2553 * This could use a better solution, since texture sampling and 2554 * math reads could potentially run into it as well -- anywhere 2555 * that we have a SEND with a destination that is a register that 2556 * was written but not read within the last N instructions (what's 2557 * N? unsure). This is rare because of dead code elimination, but 2558 * not impossible. 2559 */ 2560 if (intel->gen == 4 && !intel->is_g4x) 2561 brw_MOV(p, brw_null_reg(), dst); 2562 2563 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2564 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2565 2566 if (intel->gen == 4 && !intel->is_g4x) { 2567 /* gen4 errata: destination from a send can't be used as a 2568 * destination until it's been read. Just read it so we don't 2569 * have to worry. 2570 */ 2571 brw_MOV(p, brw_null_reg(), dst); 2572 } 2573} 2574 2575/** 2576 * To be called after the last _mesa_add_state_reference() call, to 2577 * set up prog_data.param[] for assign_curb_setup() and 2578 * setup_pull_constants(). 2579 */ 2580void 2581fs_visitor::setup_paramvalues_refs() 2582{ 2583 /* Set up the pointers to ParamValues now that that array is finalized. */ 2584 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2585 c->prog_data.param[i] = 2586 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2587 this->param_offset[i]; 2588 } 2589} 2590 2591void 2592fs_visitor::assign_curb_setup() 2593{ 2594 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2595 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2596 2597 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2598 foreach_iter(exec_list_iterator, iter, this->instructions) { 2599 fs_inst *inst = (fs_inst *)iter.get(); 2600 2601 for (unsigned int i = 0; i < 3; i++) { 2602 if (inst->src[i].file == UNIFORM) { 2603 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2604 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf + 2605 constant_nr / 8, 2606 constant_nr % 8); 2607 2608 inst->src[i].file = FIXED_HW_REG; 2609 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2610 } 2611 } 2612 } 2613} 2614 2615void 2616fs_visitor::calculate_urb_setup() 2617{ 2618 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2619 urb_setup[i] = -1; 2620 } 2621 2622 int urb_next = 0; 2623 /* Figure out where each of the incoming setup attributes lands. */ 2624 if (intel->gen >= 6) { 2625 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2626 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2627 urb_setup[i] = urb_next++; 2628 } 2629 } 2630 } else { 2631 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 2632 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2633 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2634 int fp_index; 2635 2636 if (i >= VERT_RESULT_VAR0) 2637 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2638 else if (i <= VERT_RESULT_TEX7) 2639 fp_index = i; 2640 else 2641 fp_index = -1; 2642 2643 if (fp_index >= 0) 2644 urb_setup[fp_index] = urb_next++; 2645 } 2646 } 2647 } 2648 2649 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2650 c->prog_data.urb_read_length = urb_next * 2; 2651} 2652 2653void 2654fs_visitor::assign_urb_setup() 2655{ 2656 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length; 2657 2658 /* Offset all the urb_setup[] index by the actual position of the 2659 * setup regs, now that the location of the constants has been chosen. 2660 */ 2661 foreach_iter(exec_list_iterator, iter, this->instructions) { 2662 fs_inst *inst = (fs_inst *)iter.get(); 2663 2664 if (inst->opcode == FS_OPCODE_LINTERP) { 2665 assert(inst->src[2].file == FIXED_HW_REG); 2666 inst->src[2].fixed_hw_reg.nr += urb_start; 2667 } 2668 2669 if (inst->opcode == FS_OPCODE_CINTERP) { 2670 assert(inst->src[0].file == FIXED_HW_REG); 2671 inst->src[0].fixed_hw_reg.nr += urb_start; 2672 } 2673 } 2674 2675 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2676} 2677 2678/** 2679 * Split large virtual GRFs into separate components if we can. 2680 * 2681 * This is mostly duplicated with what brw_fs_vector_splitting does, 2682 * but that's really conservative because it's afraid of doing 2683 * splitting that doesn't result in real progress after the rest of 2684 * the optimization phases, which would cause infinite looping in 2685 * optimization. We can do it once here, safely. This also has the 2686 * opportunity to split interpolated values, or maybe even uniforms, 2687 * which we don't have at the IR level. 2688 * 2689 * We want to split, because virtual GRFs are what we register 2690 * allocate and spill (due to contiguousness requirements for some 2691 * instructions), and they're what we naturally generate in the 2692 * codegen process, but most virtual GRFs don't actually need to be 2693 * contiguous sets of GRFs. If we split, we'll end up with reduced 2694 * live intervals and better dead code elimination and coalescing. 2695 */ 2696void 2697fs_visitor::split_virtual_grfs() 2698{ 2699 int num_vars = this->virtual_grf_next; 2700 bool split_grf[num_vars]; 2701 int new_virtual_grf[num_vars]; 2702 2703 /* Try to split anything > 0 sized. */ 2704 for (int i = 0; i < num_vars; i++) { 2705 if (this->virtual_grf_sizes[i] != 1) 2706 split_grf[i] = true; 2707 else 2708 split_grf[i] = false; 2709 } 2710 2711 if (brw->has_pln) { 2712 /* PLN opcodes rely on the delta_xy being contiguous. */ 2713 split_grf[this->delta_x.reg] = false; 2714 } 2715 2716 foreach_iter(exec_list_iterator, iter, this->instructions) { 2717 fs_inst *inst = (fs_inst *)iter.get(); 2718 2719 /* Texturing produces 4 contiguous registers, so no splitting. */ 2720 if (inst->is_tex()) { 2721 split_grf[inst->dst.reg] = false; 2722 } 2723 } 2724 2725 /* Allocate new space for split regs. Note that the virtual 2726 * numbers will be contiguous. 2727 */ 2728 for (int i = 0; i < num_vars; i++) { 2729 if (split_grf[i]) { 2730 new_virtual_grf[i] = virtual_grf_alloc(1); 2731 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 2732 int reg = virtual_grf_alloc(1); 2733 assert(reg == new_virtual_grf[i] + j - 1); 2734 (void) reg; 2735 } 2736 this->virtual_grf_sizes[i] = 1; 2737 } 2738 } 2739 2740 foreach_iter(exec_list_iterator, iter, this->instructions) { 2741 fs_inst *inst = (fs_inst *)iter.get(); 2742 2743 if (inst->dst.file == GRF && 2744 split_grf[inst->dst.reg] && 2745 inst->dst.reg_offset != 0) { 2746 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 2747 inst->dst.reg_offset - 1); 2748 inst->dst.reg_offset = 0; 2749 } 2750 for (int i = 0; i < 3; i++) { 2751 if (inst->src[i].file == GRF && 2752 split_grf[inst->src[i].reg] && 2753 inst->src[i].reg_offset != 0) { 2754 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 2755 inst->src[i].reg_offset - 1); 2756 inst->src[i].reg_offset = 0; 2757 } 2758 } 2759 } 2760 this->live_intervals_valid = false; 2761} 2762 2763/** 2764 * Choose accesses from the UNIFORM file to demote to using the pull 2765 * constant buffer. 2766 * 2767 * We allow a fragment shader to have more than the specified minimum 2768 * maximum number of fragment shader uniform components (64). If 2769 * there are too many of these, they'd fill up all of register space. 2770 * So, this will push some of them out to the pull constant buffer and 2771 * update the program to load them. 2772 */ 2773void 2774fs_visitor::setup_pull_constants() 2775{ 2776 /* Only allow 16 registers (128 uniform components) as push constants. */ 2777 unsigned int max_uniform_components = 16 * 8; 2778 if (c->prog_data.nr_params <= max_uniform_components) 2779 return; 2780 2781 /* Just demote the end of the list. We could probably do better 2782 * here, demoting things that are rarely used in the program first. 2783 */ 2784 int pull_uniform_base = max_uniform_components; 2785 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 2786 2787 foreach_iter(exec_list_iterator, iter, this->instructions) { 2788 fs_inst *inst = (fs_inst *)iter.get(); 2789 2790 for (int i = 0; i < 3; i++) { 2791 if (inst->src[i].file != UNIFORM) 2792 continue; 2793 2794 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2795 if (uniform_nr < pull_uniform_base) 2796 continue; 2797 2798 fs_reg dst = fs_reg(this, glsl_type::float_type); 2799 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 2800 dst); 2801 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 2802 pull->ir = inst->ir; 2803 pull->annotation = inst->annotation; 2804 pull->base_mrf = 14; 2805 pull->mlen = 1; 2806 2807 inst->insert_before(pull); 2808 2809 inst->src[i].file = GRF; 2810 inst->src[i].reg = dst.reg; 2811 inst->src[i].reg_offset = 0; 2812 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 2813 } 2814 } 2815 2816 for (int i = 0; i < pull_uniform_count; i++) { 2817 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 2818 c->prog_data.pull_param_convert[i] = 2819 c->prog_data.param_convert[pull_uniform_base + i]; 2820 } 2821 c->prog_data.nr_params -= pull_uniform_count; 2822 c->prog_data.nr_pull_params = pull_uniform_count; 2823} 2824 2825void 2826fs_visitor::calculate_live_intervals() 2827{ 2828 int num_vars = this->virtual_grf_next; 2829 int *def = ralloc_array(mem_ctx, int, num_vars); 2830 int *use = ralloc_array(mem_ctx, int, num_vars); 2831 int loop_depth = 0; 2832 int loop_start = 0; 2833 int bb_header_ip = 0; 2834 2835 if (this->live_intervals_valid) 2836 return; 2837 2838 for (int i = 0; i < num_vars; i++) { 2839 def[i] = MAX_INSTRUCTION; 2840 use[i] = -1; 2841 } 2842 2843 int ip = 0; 2844 foreach_iter(exec_list_iterator, iter, this->instructions) { 2845 fs_inst *inst = (fs_inst *)iter.get(); 2846 2847 if (inst->opcode == BRW_OPCODE_DO) { 2848 if (loop_depth++ == 0) 2849 loop_start = ip; 2850 } else if (inst->opcode == BRW_OPCODE_WHILE) { 2851 loop_depth--; 2852 2853 if (loop_depth == 0) { 2854 /* Patches up the use of vars marked for being live across 2855 * the whole loop. 2856 */ 2857 for (int i = 0; i < num_vars; i++) { 2858 if (use[i] == loop_start) { 2859 use[i] = ip; 2860 } 2861 } 2862 } 2863 } else { 2864 for (unsigned int i = 0; i < 3; i++) { 2865 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 2866 int reg = inst->src[i].reg; 2867 2868 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2869 def[reg] >= bb_header_ip)) { 2870 use[reg] = ip; 2871 } else { 2872 def[reg] = MIN2(loop_start, def[reg]); 2873 use[reg] = loop_start; 2874 2875 /* Nobody else is going to go smash our start to 2876 * later in the loop now, because def[reg] now 2877 * points before the bb header. 2878 */ 2879 } 2880 } 2881 } 2882 if (inst->dst.file == GRF && inst->dst.reg != 0) { 2883 int reg = inst->dst.reg; 2884 2885 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2886 !inst->predicated)) { 2887 def[reg] = MIN2(def[reg], ip); 2888 } else { 2889 def[reg] = MIN2(def[reg], loop_start); 2890 } 2891 } 2892 } 2893 2894 ip++; 2895 2896 /* Set the basic block header IP. This is used for determining 2897 * if a complete def of single-register virtual GRF in a loop 2898 * dominates a use in the same basic block. It's a quick way to 2899 * reduce the live interval range of most register used in a 2900 * loop. 2901 */ 2902 if (inst->opcode == BRW_OPCODE_IF || 2903 inst->opcode == BRW_OPCODE_ELSE || 2904 inst->opcode == BRW_OPCODE_ENDIF || 2905 inst->opcode == BRW_OPCODE_DO || 2906 inst->opcode == BRW_OPCODE_WHILE || 2907 inst->opcode == BRW_OPCODE_BREAK || 2908 inst->opcode == BRW_OPCODE_CONTINUE) { 2909 bb_header_ip = ip; 2910 } 2911 } 2912 2913 ralloc_free(this->virtual_grf_def); 2914 ralloc_free(this->virtual_grf_use); 2915 this->virtual_grf_def = def; 2916 this->virtual_grf_use = use; 2917 2918 this->live_intervals_valid = true; 2919} 2920 2921/** 2922 * Attempts to move immediate constants into the immediate 2923 * constant slot of following instructions. 2924 * 2925 * Immediate constants are a bit tricky -- they have to be in the last 2926 * operand slot, you can't do abs/negate on them, 2927 */ 2928 2929bool 2930fs_visitor::propagate_constants() 2931{ 2932 bool progress = false; 2933 2934 calculate_live_intervals(); 2935 2936 foreach_iter(exec_list_iterator, iter, this->instructions) { 2937 fs_inst *inst = (fs_inst *)iter.get(); 2938 2939 if (inst->opcode != BRW_OPCODE_MOV || 2940 inst->predicated || 2941 inst->dst.file != GRF || inst->src[0].file != IMM || 2942 inst->dst.type != inst->src[0].type) 2943 continue; 2944 2945 /* Don't bother with cases where we should have had the 2946 * operation on the constant folded in GLSL already. 2947 */ 2948 if (inst->saturate) 2949 continue; 2950 2951 /* Found a move of a constant to a GRF. Find anything else using the GRF 2952 * before it's written, and replace it with the constant if we can. 2953 */ 2954 exec_list_iterator scan_iter = iter; 2955 scan_iter.next(); 2956 for (; scan_iter.has_next(); scan_iter.next()) { 2957 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 2958 2959 if (scan_inst->opcode == BRW_OPCODE_DO || 2960 scan_inst->opcode == BRW_OPCODE_WHILE || 2961 scan_inst->opcode == BRW_OPCODE_ELSE || 2962 scan_inst->opcode == BRW_OPCODE_ENDIF) { 2963 break; 2964 } 2965 2966 for (int i = 2; i >= 0; i--) { 2967 if (scan_inst->src[i].file != GRF || 2968 scan_inst->src[i].reg != inst->dst.reg || 2969 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 2970 continue; 2971 2972 /* Don't bother with cases where we should have had the 2973 * operation on the constant folded in GLSL already. 2974 */ 2975 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 2976 continue; 2977 2978 switch (scan_inst->opcode) { 2979 case BRW_OPCODE_MOV: 2980 scan_inst->src[i] = inst->src[0]; 2981 progress = true; 2982 break; 2983 2984 case BRW_OPCODE_MUL: 2985 case BRW_OPCODE_ADD: 2986 if (i == 1) { 2987 scan_inst->src[i] = inst->src[0]; 2988 progress = true; 2989 } else if (i == 0 && scan_inst->src[1].file != IMM) { 2990 /* Fit this constant in by commuting the operands */ 2991 scan_inst->src[0] = scan_inst->src[1]; 2992 scan_inst->src[1] = inst->src[0]; 2993 progress = true; 2994 } 2995 break; 2996 case BRW_OPCODE_CMP: 2997 case BRW_OPCODE_SEL: 2998 if (i == 1) { 2999 scan_inst->src[i] = inst->src[0]; 3000 progress = true; 3001 } 3002 } 3003 } 3004 3005 if (scan_inst->dst.file == GRF && 3006 scan_inst->dst.reg == inst->dst.reg && 3007 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3008 scan_inst->is_tex())) { 3009 break; 3010 } 3011 } 3012 } 3013 3014 if (progress) 3015 this->live_intervals_valid = false; 3016 3017 return progress; 3018} 3019/** 3020 * Must be called after calculate_live_intervales() to remove unused 3021 * writes to registers -- register allocation will fail otherwise 3022 * because something deffed but not used won't be considered to 3023 * interfere with other regs. 3024 */ 3025bool 3026fs_visitor::dead_code_eliminate() 3027{ 3028 bool progress = false; 3029 int pc = 0; 3030 3031 calculate_live_intervals(); 3032 3033 foreach_iter(exec_list_iterator, iter, this->instructions) { 3034 fs_inst *inst = (fs_inst *)iter.get(); 3035 3036 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 3037 inst->remove(); 3038 progress = true; 3039 } 3040 3041 pc++; 3042 } 3043 3044 if (progress) 3045 live_intervals_valid = false; 3046 3047 return progress; 3048} 3049 3050bool 3051fs_visitor::register_coalesce() 3052{ 3053 bool progress = false; 3054 int if_depth = 0; 3055 int loop_depth = 0; 3056 3057 foreach_iter(exec_list_iterator, iter, this->instructions) { 3058 fs_inst *inst = (fs_inst *)iter.get(); 3059 3060 /* Make sure that we dominate the instructions we're going to 3061 * scan for interfering with our coalescing, or we won't have 3062 * scanned enough to see if anything interferes with our 3063 * coalescing. We don't dominate the following instructions if 3064 * we're in a loop or an if block. 3065 */ 3066 switch (inst->opcode) { 3067 case BRW_OPCODE_DO: 3068 loop_depth++; 3069 break; 3070 case BRW_OPCODE_WHILE: 3071 loop_depth--; 3072 break; 3073 case BRW_OPCODE_IF: 3074 if_depth++; 3075 break; 3076 case BRW_OPCODE_ENDIF: 3077 if_depth--; 3078 break; 3079 } 3080 if (loop_depth || if_depth) 3081 continue; 3082 3083 if (inst->opcode != BRW_OPCODE_MOV || 3084 inst->predicated || 3085 inst->saturate || 3086 inst->dst.file != GRF || inst->src[0].file != GRF || 3087 inst->dst.type != inst->src[0].type) 3088 continue; 3089 3090 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 3091 3092 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3093 * them: check for no writes to either one until the exit of the 3094 * program. 3095 */ 3096 bool interfered = false; 3097 exec_list_iterator scan_iter = iter; 3098 scan_iter.next(); 3099 for (; scan_iter.has_next(); scan_iter.next()) { 3100 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3101 3102 if (scan_inst->dst.file == GRF) { 3103 if (scan_inst->dst.reg == inst->dst.reg && 3104 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3105 scan_inst->is_tex())) { 3106 interfered = true; 3107 break; 3108 } 3109 if (scan_inst->dst.reg == inst->src[0].reg && 3110 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3111 scan_inst->is_tex())) { 3112 interfered = true; 3113 break; 3114 } 3115 } 3116 3117 /* The gen6 MATH instruction can't handle source modifiers, so avoid 3118 * coalescing those for now. We should do something more specific. 3119 */ 3120 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) { 3121 interfered = true; 3122 break; 3123 } 3124 } 3125 if (interfered) { 3126 continue; 3127 } 3128 3129 /* Rewrite the later usage to point at the source of the move to 3130 * be removed. 3131 */ 3132 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3133 scan_iter.next()) { 3134 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3135 3136 for (int i = 0; i < 3; i++) { 3137 if (scan_inst->src[i].file == GRF && 3138 scan_inst->src[i].reg == inst->dst.reg && 3139 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3140 scan_inst->src[i].reg = inst->src[0].reg; 3141 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3142 scan_inst->src[i].abs |= inst->src[0].abs; 3143 scan_inst->src[i].negate ^= inst->src[0].negate; 3144 scan_inst->src[i].smear = inst->src[0].smear; 3145 } 3146 } 3147 } 3148 3149 inst->remove(); 3150 progress = true; 3151 } 3152 3153 if (progress) 3154 live_intervals_valid = false; 3155 3156 return progress; 3157} 3158 3159 3160bool 3161fs_visitor::compute_to_mrf() 3162{ 3163 bool progress = false; 3164 int next_ip = 0; 3165 3166 calculate_live_intervals(); 3167 3168 foreach_iter(exec_list_iterator, iter, this->instructions) { 3169 fs_inst *inst = (fs_inst *)iter.get(); 3170 3171 int ip = next_ip; 3172 next_ip++; 3173 3174 if (inst->opcode != BRW_OPCODE_MOV || 3175 inst->predicated || 3176 inst->dst.file != MRF || inst->src[0].file != GRF || 3177 inst->dst.type != inst->src[0].type || 3178 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3179 continue; 3180 3181 /* Can't compute-to-MRF this GRF if someone else was going to 3182 * read it later. 3183 */ 3184 if (this->virtual_grf_use[inst->src[0].reg] > ip) 3185 continue; 3186 3187 /* Found a move of a GRF to a MRF. Let's see if we can go 3188 * rewrite the thing that made this GRF to write into the MRF. 3189 */ 3190 fs_inst *scan_inst; 3191 for (scan_inst = (fs_inst *)inst->prev; 3192 scan_inst->prev != NULL; 3193 scan_inst = (fs_inst *)scan_inst->prev) { 3194 if (scan_inst->dst.file == GRF && 3195 scan_inst->dst.reg == inst->src[0].reg) { 3196 /* Found the last thing to write our reg we want to turn 3197 * into a compute-to-MRF. 3198 */ 3199 3200 if (scan_inst->is_tex()) { 3201 /* texturing writes several continuous regs, so we can't 3202 * compute-to-mrf that. 3203 */ 3204 break; 3205 } 3206 3207 /* If it's predicated, it (probably) didn't populate all 3208 * the channels. 3209 */ 3210 if (scan_inst->predicated) 3211 break; 3212 3213 /* SEND instructions can't have MRF as a destination. */ 3214 if (scan_inst->mlen) 3215 break; 3216 3217 if (intel->gen >= 6) { 3218 /* gen6 math instructions must have the destination be 3219 * GRF, so no compute-to-MRF for them. 3220 */ 3221 if (scan_inst->is_math()) { 3222 break; 3223 } 3224 } 3225 3226 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 3227 /* Found the creator of our MRF's source value. */ 3228 scan_inst->dst.file = MRF; 3229 scan_inst->dst.hw_reg = inst->dst.hw_reg; 3230 scan_inst->saturate |= inst->saturate; 3231 inst->remove(); 3232 progress = true; 3233 } 3234 break; 3235 } 3236 3237 /* We don't handle flow control here. Most computation of 3238 * values that end up in MRFs are shortly before the MRF 3239 * write anyway. 3240 */ 3241 if (scan_inst->opcode == BRW_OPCODE_DO || 3242 scan_inst->opcode == BRW_OPCODE_WHILE || 3243 scan_inst->opcode == BRW_OPCODE_ELSE || 3244 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3245 break; 3246 } 3247 3248 /* You can't read from an MRF, so if someone else reads our 3249 * MRF's source GRF that we wanted to rewrite, that stops us. 3250 */ 3251 bool interfered = false; 3252 for (int i = 0; i < 3; i++) { 3253 if (scan_inst->src[i].file == GRF && 3254 scan_inst->src[i].reg == inst->src[0].reg && 3255 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 3256 interfered = true; 3257 } 3258 } 3259 if (interfered) 3260 break; 3261 3262 if (scan_inst->dst.file == MRF && 3263 scan_inst->dst.hw_reg == inst->dst.hw_reg) { 3264 /* Somebody else wrote our MRF here, so we can't can't 3265 * compute-to-MRF before that. 3266 */ 3267 break; 3268 } 3269 3270 if (scan_inst->mlen > 0) { 3271 /* Found a SEND instruction, which means that there are 3272 * live values in MRFs from base_mrf to base_mrf + 3273 * scan_inst->mlen - 1. Don't go pushing our MRF write up 3274 * above it. 3275 */ 3276 if (inst->dst.hw_reg >= scan_inst->base_mrf && 3277 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) { 3278 break; 3279 } 3280 } 3281 } 3282 } 3283 3284 return progress; 3285} 3286 3287/** 3288 * Walks through basic blocks, locking for repeated MRF writes and 3289 * removing the later ones. 3290 */ 3291bool 3292fs_visitor::remove_duplicate_mrf_writes() 3293{ 3294 fs_inst *last_mrf_move[16]; 3295 bool progress = false; 3296 3297 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3298 3299 foreach_iter(exec_list_iterator, iter, this->instructions) { 3300 fs_inst *inst = (fs_inst *)iter.get(); 3301 3302 switch (inst->opcode) { 3303 case BRW_OPCODE_DO: 3304 case BRW_OPCODE_WHILE: 3305 case BRW_OPCODE_IF: 3306 case BRW_OPCODE_ELSE: 3307 case BRW_OPCODE_ENDIF: 3308 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3309 continue; 3310 default: 3311 break; 3312 } 3313 3314 if (inst->opcode == BRW_OPCODE_MOV && 3315 inst->dst.file == MRF) { 3316 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 3317 if (prev_inst && inst->equals(prev_inst)) { 3318 inst->remove(); 3319 progress = true; 3320 continue; 3321 } 3322 } 3323 3324 /* Clear out the last-write records for MRFs that were overwritten. */ 3325 if (inst->dst.file == MRF) { 3326 last_mrf_move[inst->dst.hw_reg] = NULL; 3327 } 3328 3329 if (inst->mlen > 0) { 3330 /* Found a SEND instruction, which will include two or fewer 3331 * implied MRF writes. We could do better here. 3332 */ 3333 for (int i = 0; i < implied_mrf_writes(inst); i++) { 3334 last_mrf_move[inst->base_mrf + i] = NULL; 3335 } 3336 } 3337 3338 /* Clear out any MRF move records whose sources got overwritten. */ 3339 if (inst->dst.file == GRF) { 3340 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3341 if (last_mrf_move[i] && 3342 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3343 last_mrf_move[i] = NULL; 3344 } 3345 } 3346 } 3347 3348 if (inst->opcode == BRW_OPCODE_MOV && 3349 inst->dst.file == MRF && 3350 inst->src[0].file == GRF && 3351 !inst->predicated) { 3352 last_mrf_move[inst->dst.hw_reg] = inst; 3353 } 3354 } 3355 3356 return progress; 3357} 3358 3359bool 3360fs_visitor::virtual_grf_interferes(int a, int b) 3361{ 3362 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3363 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3364 3365 /* We can't handle dead register writes here, without iterating 3366 * over the whole instruction stream to find every single dead 3367 * write to that register to compare to the live interval of the 3368 * other register. Just assert that dead_code_eliminate() has been 3369 * called. 3370 */ 3371 assert((this->virtual_grf_use[a] != -1 || 3372 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3373 (this->virtual_grf_use[b] != -1 || 3374 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3375 3376 return start < end; 3377} 3378 3379static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3380{ 3381 struct brw_reg brw_reg; 3382 3383 switch (reg->file) { 3384 case GRF: 3385 case ARF: 3386 case MRF: 3387 if (reg->smear == -1) { 3388 brw_reg = brw_vec8_reg(reg->file, 3389 reg->hw_reg, 0); 3390 } else { 3391 brw_reg = brw_vec1_reg(reg->file, 3392 reg->hw_reg, reg->smear); 3393 } 3394 brw_reg = retype(brw_reg, reg->type); 3395 break; 3396 case IMM: 3397 switch (reg->type) { 3398 case BRW_REGISTER_TYPE_F: 3399 brw_reg = brw_imm_f(reg->imm.f); 3400 break; 3401 case BRW_REGISTER_TYPE_D: 3402 brw_reg = brw_imm_d(reg->imm.i); 3403 break; 3404 case BRW_REGISTER_TYPE_UD: 3405 brw_reg = brw_imm_ud(reg->imm.u); 3406 break; 3407 default: 3408 assert(!"not reached"); 3409 brw_reg = brw_null_reg(); 3410 break; 3411 } 3412 break; 3413 case FIXED_HW_REG: 3414 brw_reg = reg->fixed_hw_reg; 3415 break; 3416 case BAD_FILE: 3417 /* Probably unused. */ 3418 brw_reg = brw_null_reg(); 3419 break; 3420 case UNIFORM: 3421 assert(!"not reached"); 3422 brw_reg = brw_null_reg(); 3423 break; 3424 default: 3425 assert(!"not reached"); 3426 brw_reg = brw_null_reg(); 3427 break; 3428 } 3429 if (reg->abs) 3430 brw_reg = brw_abs(brw_reg); 3431 if (reg->negate) 3432 brw_reg = negate(brw_reg); 3433 3434 return brw_reg; 3435} 3436 3437void 3438fs_visitor::generate_code() 3439{ 3440 int last_native_inst = 0; 3441 const char *last_annotation_string = NULL; 3442 ir_instruction *last_annotation_ir = NULL; 3443 3444 int if_stack_array_size = 16; 3445 int loop_stack_array_size = 16; 3446 int if_stack_depth = 0, loop_stack_depth = 0; 3447 brw_instruction **if_stack = 3448 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size); 3449 brw_instruction **loop_stack = 3450 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); 3451 int *if_depth_in_loop = 3452 rzalloc_array(this->mem_ctx, int, loop_stack_array_size); 3453 3454 3455 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3456 printf("Native code for fragment shader %d:\n", 3457 ctx->Shader.CurrentFragmentProgram->Name); 3458 } 3459 3460 foreach_iter(exec_list_iterator, iter, this->instructions) { 3461 fs_inst *inst = (fs_inst *)iter.get(); 3462 struct brw_reg src[3], dst; 3463 3464 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3465 if (last_annotation_ir != inst->ir) { 3466 last_annotation_ir = inst->ir; 3467 if (last_annotation_ir) { 3468 printf(" "); 3469 last_annotation_ir->print(); 3470 printf("\n"); 3471 } 3472 } 3473 if (last_annotation_string != inst->annotation) { 3474 last_annotation_string = inst->annotation; 3475 if (last_annotation_string) 3476 printf(" %s\n", last_annotation_string); 3477 } 3478 } 3479 3480 for (unsigned int i = 0; i < 3; i++) { 3481 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3482 } 3483 dst = brw_reg_from_fs_reg(&inst->dst); 3484 3485 brw_set_conditionalmod(p, inst->conditional_mod); 3486 brw_set_predicate_control(p, inst->predicated); 3487 brw_set_saturate(p, inst->saturate); 3488 3489 switch (inst->opcode) { 3490 case BRW_OPCODE_MOV: 3491 brw_MOV(p, dst, src[0]); 3492 break; 3493 case BRW_OPCODE_ADD: 3494 brw_ADD(p, dst, src[0], src[1]); 3495 break; 3496 case BRW_OPCODE_MUL: 3497 brw_MUL(p, dst, src[0], src[1]); 3498 break; 3499 3500 case BRW_OPCODE_FRC: 3501 brw_FRC(p, dst, src[0]); 3502 break; 3503 case BRW_OPCODE_RNDD: 3504 brw_RNDD(p, dst, src[0]); 3505 break; 3506 case BRW_OPCODE_RNDE: 3507 brw_RNDE(p, dst, src[0]); 3508 break; 3509 case BRW_OPCODE_RNDZ: 3510 brw_RNDZ(p, dst, src[0]); 3511 break; 3512 3513 case BRW_OPCODE_AND: 3514 brw_AND(p, dst, src[0], src[1]); 3515 break; 3516 case BRW_OPCODE_OR: 3517 brw_OR(p, dst, src[0], src[1]); 3518 break; 3519 case BRW_OPCODE_XOR: 3520 brw_XOR(p, dst, src[0], src[1]); 3521 break; 3522 case BRW_OPCODE_NOT: 3523 brw_NOT(p, dst, src[0]); 3524 break; 3525 case BRW_OPCODE_ASR: 3526 brw_ASR(p, dst, src[0], src[1]); 3527 break; 3528 case BRW_OPCODE_SHR: 3529 brw_SHR(p, dst, src[0], src[1]); 3530 break; 3531 case BRW_OPCODE_SHL: 3532 brw_SHL(p, dst, src[0], src[1]); 3533 break; 3534 3535 case BRW_OPCODE_CMP: 3536 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3537 break; 3538 case BRW_OPCODE_SEL: 3539 brw_SEL(p, dst, src[0], src[1]); 3540 break; 3541 3542 case BRW_OPCODE_IF: 3543 if (inst->src[0].file != BAD_FILE) { 3544 assert(intel->gen >= 6); 3545 if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]); 3546 } else { 3547 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8); 3548 } 3549 if_depth_in_loop[loop_stack_depth]++; 3550 if_stack_depth++; 3551 if (if_stack_array_size <= if_stack_depth) { 3552 if_stack_array_size *= 2; 3553 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *, 3554 if_stack_array_size); 3555 } 3556 break; 3557 3558 case BRW_OPCODE_ELSE: 3559 if_stack[if_stack_depth - 1] = 3560 brw_ELSE(p, if_stack[if_stack_depth - 1]); 3561 break; 3562 case BRW_OPCODE_ENDIF: 3563 if_stack_depth--; 3564 brw_ENDIF(p , if_stack[if_stack_depth]); 3565 if_depth_in_loop[loop_stack_depth]--; 3566 break; 3567 3568 case BRW_OPCODE_DO: 3569 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3570 if (loop_stack_array_size <= loop_stack_depth) { 3571 loop_stack_array_size *= 2; 3572 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, 3573 loop_stack_array_size); 3574 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, 3575 loop_stack_array_size); 3576 } 3577 if_depth_in_loop[loop_stack_depth] = 0; 3578 break; 3579 3580 case BRW_OPCODE_BREAK: 3581 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3582 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3583 break; 3584 case BRW_OPCODE_CONTINUE: 3585 /* FINISHME: We need to write the loop instruction support still. */ 3586 if (intel->gen >= 6) 3587 brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]); 3588 else 3589 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3590 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3591 break; 3592 3593 case BRW_OPCODE_WHILE: { 3594 struct brw_instruction *inst0, *inst1; 3595 GLuint br = 1; 3596 3597 if (intel->gen >= 5) 3598 br = 2; 3599 3600 assert(loop_stack_depth > 0); 3601 loop_stack_depth--; 3602 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3603 if (intel->gen < 6) { 3604 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3605 while (inst0 > loop_stack[loop_stack_depth]) { 3606 inst0--; 3607 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3608 inst0->bits3.if_else.jump_count == 0) { 3609 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3610 } 3611 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3612 inst0->bits3.if_else.jump_count == 0) { 3613 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3614 } 3615 } 3616 } 3617 } 3618 break; 3619 3620 case FS_OPCODE_RCP: 3621 case FS_OPCODE_RSQ: 3622 case FS_OPCODE_SQRT: 3623 case FS_OPCODE_EXP2: 3624 case FS_OPCODE_LOG2: 3625 case FS_OPCODE_POW: 3626 case FS_OPCODE_SIN: 3627 case FS_OPCODE_COS: 3628 generate_math(inst, dst, src); 3629 break; 3630 case FS_OPCODE_CINTERP: 3631 brw_MOV(p, dst, src[0]); 3632 break; 3633 case FS_OPCODE_LINTERP: 3634 generate_linterp(inst, dst, src); 3635 break; 3636 case FS_OPCODE_TEX: 3637 case FS_OPCODE_TXB: 3638 case FS_OPCODE_TXD: 3639 case FS_OPCODE_TXL: 3640 generate_tex(inst, dst, src[0]); 3641 break; 3642 case FS_OPCODE_DISCARD_NOT: 3643 generate_discard_not(inst, dst); 3644 break; 3645 case FS_OPCODE_DISCARD_AND: 3646 generate_discard_and(inst, src[0]); 3647 break; 3648 case FS_OPCODE_DDX: 3649 generate_ddx(inst, dst, src[0]); 3650 break; 3651 case FS_OPCODE_DDY: 3652 generate_ddy(inst, dst, src[0]); 3653 break; 3654 3655 case FS_OPCODE_SPILL: 3656 generate_spill(inst, src[0]); 3657 break; 3658 3659 case FS_OPCODE_UNSPILL: 3660 generate_unspill(inst, dst); 3661 break; 3662 3663 case FS_OPCODE_PULL_CONSTANT_LOAD: 3664 generate_pull_constant_load(inst, dst); 3665 break; 3666 3667 case FS_OPCODE_FB_WRITE: 3668 generate_fb_write(inst); 3669 break; 3670 default: 3671 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { 3672 _mesa_problem(ctx, "Unsupported opcode `%s' in FS", 3673 brw_opcodes[inst->opcode].name); 3674 } else { 3675 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); 3676 } 3677 this->fail = true; 3678 } 3679 3680 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3681 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) { 3682 if (0) { 3683 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3684 ((uint32_t *)&p->store[i])[3], 3685 ((uint32_t *)&p->store[i])[2], 3686 ((uint32_t *)&p->store[i])[1], 3687 ((uint32_t *)&p->store[i])[0]); 3688 } 3689 brw_disasm(stdout, &p->store[i], intel->gen); 3690 } 3691 } 3692 3693 last_native_inst = p->nr_insn; 3694 } 3695 3696 ralloc_free(if_stack); 3697 ralloc_free(loop_stack); 3698 ralloc_free(if_depth_in_loop); 3699 3700 brw_set_uip_jip(p); 3701 3702 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS 3703 * emit issues, it doesn't get the jump distances into the output, 3704 * which is often something we want to debug. So this is here in 3705 * case you're doing that. 3706 */ 3707 if (0) { 3708 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3709 for (unsigned int i = 0; i < p->nr_insn; i++) { 3710 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3711 ((uint32_t *)&p->store[i])[3], 3712 ((uint32_t *)&p->store[i])[2], 3713 ((uint32_t *)&p->store[i])[1], 3714 ((uint32_t *)&p->store[i])[0]); 3715 brw_disasm(stdout, &p->store[i], intel->gen); 3716 } 3717 } 3718 } 3719} 3720 3721GLboolean 3722brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) 3723{ 3724 struct intel_context *intel = &brw->intel; 3725 struct gl_context *ctx = &intel->ctx; 3726 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; 3727 3728 if (!prog) 3729 return GL_FALSE; 3730 3731 struct brw_shader *shader = 3732 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 3733 if (!shader) 3734 return GL_FALSE; 3735 3736 /* We always use 8-wide mode, at least for now. For one, flow 3737 * control only works in 8-wide. Also, when we're fragment shader 3738 * bound, we're almost always under register pressure as well, so 3739 * 8-wide would save us from the performance cliff of spilling 3740 * regs. 3741 */ 3742 c->dispatch_width = 8; 3743 3744 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3745 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 3746 _mesa_print_ir(shader->ir, NULL); 3747 printf("\n"); 3748 } 3749 3750 /* Now the main event: Visit the shader IR and generate our FS IR for it. 3751 */ 3752 fs_visitor v(c, shader); 3753 3754 if (0) { 3755 v.emit_dummy_fs(); 3756 } else { 3757 v.calculate_urb_setup(); 3758 if (intel->gen < 6) 3759 v.emit_interpolation_setup_gen4(); 3760 else 3761 v.emit_interpolation_setup_gen6(); 3762 3763 /* Generate FS IR for main(). (the visitor only descends into 3764 * functions called "main"). 3765 */ 3766 foreach_iter(exec_list_iterator, iter, *shader->ir) { 3767 ir_instruction *ir = (ir_instruction *)iter.get(); 3768 v.base_ir = ir; 3769 ir->accept(&v); 3770 } 3771 3772 v.emit_fb_writes(); 3773 3774 v.split_virtual_grfs(); 3775 3776 v.setup_paramvalues_refs(); 3777 v.setup_pull_constants(); 3778 3779 bool progress; 3780 do { 3781 progress = false; 3782 3783 progress = v.remove_duplicate_mrf_writes() || progress; 3784 3785 progress = v.propagate_constants() || progress; 3786 progress = v.register_coalesce() || progress; 3787 progress = v.compute_to_mrf() || progress; 3788 progress = v.dead_code_eliminate() || progress; 3789 } while (progress); 3790 3791 v.schedule_instructions(); 3792 3793 v.assign_curb_setup(); 3794 v.assign_urb_setup(); 3795 3796 if (0) { 3797 /* Debug of register spilling: Go spill everything. */ 3798 int virtual_grf_count = v.virtual_grf_next; 3799 for (int i = 1; i < virtual_grf_count; i++) { 3800 v.spill_reg(i); 3801 } 3802 } 3803 3804 if (0) 3805 v.assign_regs_trivial(); 3806 else { 3807 while (!v.assign_regs()) { 3808 if (v.fail) 3809 break; 3810 } 3811 } 3812 } 3813 3814 if (!v.fail) 3815 v.generate_code(); 3816 3817 assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */ 3818 3819 if (v.fail) 3820 return GL_FALSE; 3821 3822 c->prog_data.total_grf = v.grf_used; 3823 3824 return GL_TRUE; 3825} 3826