brw_fs.cpp revision 252eaa765e69a70036ec33a7e1e0ffeac1aab2ff
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28extern "C" { 29 30#include <sys/types.h> 31 32#include "main/macros.h" 33#include "main/shaderobj.h" 34#include "main/uniforms.h" 35#include "program/prog_parameter.h" 36#include "program/prog_print.h" 37#include "program/prog_optimize.h" 38#include "program/register_allocate.h" 39#include "program/sampler.h" 40#include "program/hash_table.h" 41#include "brw_context.h" 42#include "brw_eu.h" 43#include "brw_wm.h" 44} 45#include "brw_fs.h" 46#include "../glsl/glsl_types.h" 47#include "../glsl/ir_optimization.h" 48#include "../glsl/ir_print_visitor.h" 49 50#define MAX_INSTRUCTION (1 << 30) 51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); 52 53struct gl_shader * 54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) 55{ 56 struct brw_shader *shader; 57 58 shader = rzalloc(NULL, struct brw_shader); 59 if (shader) { 60 shader->base.Type = type; 61 shader->base.Name = name; 62 _mesa_init_shader(ctx, &shader->base); 63 } 64 65 return &shader->base; 66} 67 68struct gl_shader_program * 69brw_new_shader_program(struct gl_context *ctx, GLuint name) 70{ 71 struct brw_shader_program *prog; 72 prog = rzalloc(NULL, struct brw_shader_program); 73 if (prog) { 74 prog->base.Name = name; 75 _mesa_init_shader_program(ctx, &prog->base); 76 } 77 return &prog->base; 78} 79 80GLboolean 81brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) 82{ 83 struct brw_context *brw = brw_context(ctx); 84 struct intel_context *intel = &brw->intel; 85 86 struct brw_shader *shader = 87 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 88 if (shader != NULL) { 89 void *mem_ctx = ralloc_context(NULL); 90 bool progress; 91 92 if (shader->ir) 93 ralloc_free(shader->ir); 94 shader->ir = new(shader) exec_list; 95 clone_ir_list(mem_ctx, shader->ir, shader->base.ir); 96 97 do_mat_op_to_vec(shader->ir); 98 lower_instructions(shader->ir, 99 MOD_TO_FRACT | 100 DIV_TO_MUL_RCP | 101 
SUB_TO_ADD_NEG | 102 EXP_TO_EXP2 | 103 LOG_TO_LOG2); 104 105 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, 106 * if-statements need to be flattened. 107 */ 108 if (intel->gen < 6) 109 lower_if_to_cond_assign(shader->ir, 16); 110 111 do_lower_texture_projection(shader->ir); 112 do_vec_index_to_cond_assign(shader->ir); 113 brw_do_cubemap_normalize(shader->ir); 114 lower_noise(shader->ir); 115 lower_quadop_vector(shader->ir, false); 116 lower_variable_index_to_cond_assign(shader->ir, 117 GL_TRUE, /* input */ 118 GL_TRUE, /* output */ 119 GL_TRUE, /* temp */ 120 GL_TRUE /* uniform */ 121 ); 122 123 do { 124 progress = false; 125 126 brw_do_channel_expressions(shader->ir); 127 brw_do_vector_splitting(shader->ir); 128 129 progress = do_lower_jumps(shader->ir, true, true, 130 true, /* main return */ 131 false, /* continue */ 132 false /* loops */ 133 ) || progress; 134 135 progress = do_common_optimization(shader->ir, true, 32) || progress; 136 } while (progress); 137 138 validate_ir_tree(shader->ir); 139 140 reparent_ir(shader->ir, shader->ir); 141 ralloc_free(mem_ctx); 142 } 143 144 if (!_mesa_ir_link_shader(ctx, prog)) 145 return GL_FALSE; 146 147 return GL_TRUE; 148} 149 150static int 151type_size(const struct glsl_type *type) 152{ 153 unsigned int size, i; 154 155 switch (type->base_type) { 156 case GLSL_TYPE_UINT: 157 case GLSL_TYPE_INT: 158 case GLSL_TYPE_FLOAT: 159 case GLSL_TYPE_BOOL: 160 return type->components(); 161 case GLSL_TYPE_ARRAY: 162 return type_size(type->fields.array) * type->length; 163 case GLSL_TYPE_STRUCT: 164 size = 0; 165 for (i = 0; i < type->length; i++) { 166 size += type_size(type->fields.structure[i].type); 167 } 168 return size; 169 case GLSL_TYPE_SAMPLER: 170 /* Samplers take up no register space, since they're baked in at 171 * link time. 172 */ 173 return 0; 174 default: 175 assert(!"not reached"); 176 return 0; 177 } 178} 179 180void 181fs_visitor::fail(const char *format, ...) 
182{ 183 if (!failed) { 184 failed = true; 185 186 if (INTEL_DEBUG & DEBUG_WM) { 187 fprintf(stderr, "FS compile failed: "); 188 189 va_list va; 190 va_start(va, format); 191 vfprintf(stderr, format, va); 192 va_end(va); 193 } 194 } 195} 196 197/** 198 * Returns how many MRFs an FS opcode will write over. 199 * 200 * Note that this is not the 0 or 1 implied writes in an actual gen 201 * instruction -- the FS opcodes often generate MOVs in addition. 202 */ 203int 204fs_visitor::implied_mrf_writes(fs_inst *inst) 205{ 206 if (inst->mlen == 0) 207 return 0; 208 209 switch (inst->opcode) { 210 case FS_OPCODE_RCP: 211 case FS_OPCODE_RSQ: 212 case FS_OPCODE_SQRT: 213 case FS_OPCODE_EXP2: 214 case FS_OPCODE_LOG2: 215 case FS_OPCODE_SIN: 216 case FS_OPCODE_COS: 217 return 1; 218 case FS_OPCODE_POW: 219 return 2; 220 case FS_OPCODE_TEX: 221 case FS_OPCODE_TXB: 222 case FS_OPCODE_TXD: 223 case FS_OPCODE_TXL: 224 return 1; 225 case FS_OPCODE_FB_WRITE: 226 return 2; 227 case FS_OPCODE_PULL_CONSTANT_LOAD: 228 case FS_OPCODE_UNSPILL: 229 return 1; 230 case FS_OPCODE_SPILL: 231 return 2; 232 default: 233 assert(!"not reached"); 234 return inst->mlen; 235 } 236} 237 238int 239fs_visitor::virtual_grf_alloc(int size) 240{ 241 if (virtual_grf_array_size <= virtual_grf_next) { 242 if (virtual_grf_array_size == 0) 243 virtual_grf_array_size = 16; 244 else 245 virtual_grf_array_size *= 2; 246 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 247 virtual_grf_array_size); 248 249 /* This slot is always unused. */ 250 virtual_grf_sizes[0] = 0; 251 } 252 virtual_grf_sizes[virtual_grf_next] = size; 253 return virtual_grf_next++; 254} 255 256/** Fixed HW reg constructor. */ 257fs_reg::fs_reg(enum register_file file, int hw_reg) 258{ 259 init(); 260 this->file = file; 261 this->hw_reg = hw_reg; 262 this->type = BRW_REGISTER_TYPE_F; 263} 264 265/** Fixed HW reg constructor. 
*/ 266fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 267{ 268 init(); 269 this->file = file; 270 this->hw_reg = hw_reg; 271 this->type = type; 272} 273 274int 275brw_type_for_base_type(const struct glsl_type *type) 276{ 277 switch (type->base_type) { 278 case GLSL_TYPE_FLOAT: 279 return BRW_REGISTER_TYPE_F; 280 case GLSL_TYPE_INT: 281 case GLSL_TYPE_BOOL: 282 return BRW_REGISTER_TYPE_D; 283 case GLSL_TYPE_UINT: 284 return BRW_REGISTER_TYPE_UD; 285 case GLSL_TYPE_ARRAY: 286 case GLSL_TYPE_STRUCT: 287 case GLSL_TYPE_SAMPLER: 288 /* These should be overridden with the type of the member when 289 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely 290 * way to trip up if we don't. 291 */ 292 return BRW_REGISTER_TYPE_UD; 293 default: 294 assert(!"not reached"); 295 return BRW_REGISTER_TYPE_F; 296 } 297} 298 299/** Automatic reg constructor. */ 300fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 301{ 302 init(); 303 304 this->file = GRF; 305 this->reg = v->virtual_grf_alloc(type_size(type)); 306 this->reg_offset = 0; 307 this->type = brw_type_for_base_type(type); 308} 309 310fs_reg * 311fs_visitor::variable_storage(ir_variable *var) 312{ 313 return (fs_reg *)hash_table_find(this->variable_ht, var); 314} 315 316/* Our support for uniforms is piggy-backed on the struct 317 * gl_fragment_program, because that's where the values actually 318 * get stored, rather than in some global gl_shader_program uniform 319 * store. 
320 */ 321int 322fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 323{ 324 unsigned int offset = 0; 325 326 if (type->is_matrix()) { 327 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 328 type->vector_elements, 329 1); 330 331 for (unsigned int i = 0; i < type->matrix_columns; i++) { 332 offset += setup_uniform_values(loc + offset, column); 333 } 334 335 return offset; 336 } 337 338 switch (type->base_type) { 339 case GLSL_TYPE_FLOAT: 340 case GLSL_TYPE_UINT: 341 case GLSL_TYPE_INT: 342 case GLSL_TYPE_BOOL: 343 for (unsigned int i = 0; i < type->vector_elements; i++) { 344 unsigned int param = c->prog_data.nr_params++; 345 346 assert(param < ARRAY_SIZE(c->prog_data.param)); 347 348 switch (type->base_type) { 349 case GLSL_TYPE_FLOAT: 350 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 351 break; 352 case GLSL_TYPE_UINT: 353 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 354 break; 355 case GLSL_TYPE_INT: 356 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 357 break; 358 case GLSL_TYPE_BOOL: 359 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 360 break; 361 default: 362 assert(!"not reached"); 363 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 364 break; 365 } 366 this->param_index[param] = loc; 367 this->param_offset[param] = i; 368 } 369 return 1; 370 371 case GLSL_TYPE_STRUCT: 372 for (unsigned int i = 0; i < type->length; i++) { 373 offset += setup_uniform_values(loc + offset, 374 type->fields.structure[i].type); 375 } 376 return offset; 377 378 case GLSL_TYPE_ARRAY: 379 for (unsigned int i = 0; i < type->length; i++) { 380 offset += setup_uniform_values(loc + offset, type->fields.array); 381 } 382 return offset; 383 384 case GLSL_TYPE_SAMPLER: 385 /* The sampler takes up a slot, but we don't use any values from it. */ 386 return 1; 387 388 default: 389 assert(!"not reached"); 390 return 0; 391 } 392} 393 394 395/* Our support for builtin uniforms is even scarier than non-builtin. 
396 * It sits on top of the PROG_STATE_VAR parameters that are 397 * automatically updated from GL context state. 398 */ 399void 400fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 401{ 402 const struct gl_builtin_uniform_desc *statevar = NULL; 403 404 for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) { 405 statevar = &_mesa_builtin_uniform_desc[i]; 406 if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0) 407 break; 408 } 409 410 if (!statevar->name) { 411 fail("Failed to find builtin uniform `%s'\n", ir->name); 412 return; 413 } 414 415 int array_count; 416 if (ir->type->is_array()) { 417 array_count = ir->type->length; 418 } else { 419 array_count = 1; 420 } 421 422 for (int a = 0; a < array_count; a++) { 423 for (unsigned int i = 0; i < statevar->num_elements; i++) { 424 struct gl_builtin_uniform_element *element = &statevar->elements[i]; 425 int tokens[STATE_LENGTH]; 426 427 memcpy(tokens, element->tokens, sizeof(element->tokens)); 428 if (ir->type->is_array()) { 429 tokens[1] = a; 430 } 431 432 /* This state reference has already been setup by ir_to_mesa, 433 * but we'll get the same index back here. 434 */ 435 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 436 (gl_state_index *)tokens); 437 438 /* Add each of the unique swizzles of the element as a 439 * parameter. This'll end up matching the expected layout of 440 * the array/matrix/structure we're trying to fill in. 
441 */ 442 int last_swiz = -1; 443 for (unsigned int j = 0; j < 4; j++) { 444 int swiz = GET_SWZ(element->swizzle, j); 445 if (swiz == last_swiz) 446 break; 447 last_swiz = swiz; 448 449 c->prog_data.param_convert[c->prog_data.nr_params] = 450 PARAM_NO_CONVERT; 451 this->param_index[c->prog_data.nr_params] = index; 452 this->param_offset[c->prog_data.nr_params] = swiz; 453 c->prog_data.nr_params++; 454 } 455 } 456 } 457} 458 459fs_reg * 460fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 461{ 462 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 463 fs_reg wpos = *reg; 464 fs_reg neg_y = this->pixel_y; 465 neg_y.negate = true; 466 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 467 468 /* gl_FragCoord.x */ 469 if (ir->pixel_center_integer) { 470 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 471 } else { 472 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 473 } 474 wpos.reg_offset++; 475 476 /* gl_FragCoord.y */ 477 if (!flip && ir->pixel_center_integer) { 478 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 479 } else { 480 fs_reg pixel_y = this->pixel_y; 481 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 482 483 if (flip) { 484 pixel_y.negate = true; 485 offset += c->key.drawable_height - 1.0; 486 } 487 488 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 489 } 490 wpos.reg_offset++; 491 492 /* gl_FragCoord.z */ 493 if (intel->gen >= 6) { 494 emit(BRW_OPCODE_MOV, wpos, 495 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 496 } else { 497 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 498 interp_reg(FRAG_ATTRIB_WPOS, 2)); 499 } 500 wpos.reg_offset++; 501 502 /* gl_FragCoord.w: Already set up in emit_interpolation */ 503 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 504 505 return reg; 506} 507 508fs_reg * 509fs_visitor::emit_general_interpolation(ir_variable *ir) 510{ 511 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 512 /* Interpolation is always in floating point regs. 
*/ 513 reg->type = BRW_REGISTER_TYPE_F; 514 fs_reg attr = *reg; 515 516 unsigned int array_elements; 517 const glsl_type *type; 518 519 if (ir->type->is_array()) { 520 array_elements = ir->type->length; 521 if (array_elements == 0) { 522 fail("dereferenced array '%s' has length 0\n", ir->name); 523 } 524 type = ir->type->fields.array; 525 } else { 526 array_elements = 1; 527 type = ir->type; 528 } 529 530 int location = ir->location; 531 for (unsigned int i = 0; i < array_elements; i++) { 532 for (unsigned int j = 0; j < type->matrix_columns; j++) { 533 if (urb_setup[location] == -1) { 534 /* If there's no incoming setup data for this slot, don't 535 * emit interpolation for it. 536 */ 537 attr.reg_offset += type->vector_elements; 538 location++; 539 continue; 540 } 541 542 bool is_gl_Color = 543 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 544 545 if (c->key.flat_shade && is_gl_Color) { 546 /* Constant interpolation (flat shading) case. The SF has 547 * handed us defined values in only the constant offset 548 * field of the setup reg. 549 */ 550 for (unsigned int k = 0; k < type->vector_elements; k++) { 551 struct brw_reg interp = interp_reg(location, k); 552 interp = suboffset(interp, 3); 553 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 554 attr.reg_offset++; 555 } 556 } else { 557 /* Perspective interpolation case. 
*/ 558 for (unsigned int k = 0; k < type->vector_elements; k++) { 559 struct brw_reg interp = interp_reg(location, k); 560 emit(FS_OPCODE_LINTERP, attr, 561 this->delta_x, this->delta_y, fs_reg(interp)); 562 attr.reg_offset++; 563 } 564 565 if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) { 566 attr.reg_offset -= type->vector_elements; 567 for (unsigned int k = 0; k < type->vector_elements; k++) { 568 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 569 attr.reg_offset++; 570 } 571 } 572 } 573 location++; 574 } 575 } 576 577 return reg; 578} 579 580fs_reg * 581fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 582{ 583 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 584 585 /* The frontfacing comes in as a bit in the thread payload. */ 586 if (intel->gen >= 6) { 587 emit(BRW_OPCODE_ASR, *reg, 588 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 589 fs_reg(15)); 590 emit(BRW_OPCODE_NOT, *reg, *reg); 591 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 592 } else { 593 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 594 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 595 * us front face 596 */ 597 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 598 fs_reg(r1_6ud), 599 fs_reg(1u << 31)); 600 inst->conditional_mod = BRW_CONDITIONAL_L; 601 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 602 } 603 604 return reg; 605} 606 607fs_inst * 608fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 609{ 610 switch (opcode) { 611 case FS_OPCODE_RCP: 612 case FS_OPCODE_RSQ: 613 case FS_OPCODE_SQRT: 614 case FS_OPCODE_EXP2: 615 case FS_OPCODE_LOG2: 616 case FS_OPCODE_SIN: 617 case FS_OPCODE_COS: 618 break; 619 default: 620 assert(!"not reached: bad math opcode"); 621 return NULL; 622 } 623 624 /* Can't do hstride == 0 args to gen6 math, so expand it out. 
We 625 * might be able to do better by doing execsize = 1 math and then 626 * expanding that result out, but we would need to be careful with 627 * masking. 628 * 629 * The hardware ignores source modifiers (negate and abs) on math 630 * instructions, so we also move to a temp to set those up. 631 */ 632 if (intel->gen >= 6 && (src.file == UNIFORM || 633 src.abs || 634 src.negate)) { 635 fs_reg expanded = fs_reg(this, glsl_type::float_type); 636 emit(BRW_OPCODE_MOV, expanded, src); 637 src = expanded; 638 } 639 640 fs_inst *inst = emit(opcode, dst, src); 641 642 if (intel->gen < 6) { 643 inst->base_mrf = 2; 644 inst->mlen = 1; 645 } 646 647 return inst; 648} 649 650fs_inst * 651fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 652{ 653 int base_mrf = 2; 654 fs_inst *inst; 655 656 assert(opcode == FS_OPCODE_POW); 657 658 if (intel->gen >= 6) { 659 /* Can't do hstride == 0 args to gen6 math, so expand it out. 660 * 661 * The hardware ignores source modifiers (negate and abs) on math 662 * instructions, so we also move to a temp to set those up. 
663 */ 664 if (src0.file == UNIFORM || src0.abs || src0.negate) { 665 fs_reg expanded = fs_reg(this, glsl_type::float_type); 666 emit(BRW_OPCODE_MOV, expanded, src0); 667 src0 = expanded; 668 } 669 670 if (src1.file == UNIFORM || src1.abs || src1.negate) { 671 fs_reg expanded = fs_reg(this, glsl_type::float_type); 672 emit(BRW_OPCODE_MOV, expanded, src1); 673 src1 = expanded; 674 } 675 676 inst = emit(opcode, dst, src0, src1); 677 } else { 678 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 679 inst = emit(opcode, dst, src0, reg_null_f); 680 681 inst->base_mrf = base_mrf; 682 inst->mlen = 2; 683 } 684 return inst; 685} 686 687void 688fs_visitor::visit(ir_variable *ir) 689{ 690 fs_reg *reg = NULL; 691 692 if (variable_storage(ir)) 693 return; 694 695 if (strcmp(ir->name, "gl_FragColor") == 0) { 696 this->frag_color = ir; 697 } else if (strcmp(ir->name, "gl_FragData") == 0) { 698 this->frag_data = ir; 699 } else if (strcmp(ir->name, "gl_FragDepth") == 0) { 700 this->frag_depth = ir; 701 } 702 703 if (ir->mode == ir_var_in) { 704 if (!strcmp(ir->name, "gl_FragCoord")) { 705 reg = emit_fragcoord_interpolation(ir); 706 } else if (!strcmp(ir->name, "gl_FrontFacing")) { 707 reg = emit_frontfacing_interpolation(ir); 708 } else { 709 reg = emit_general_interpolation(ir); 710 } 711 assert(reg); 712 hash_table_insert(this->variable_ht, reg, ir); 713 return; 714 } 715 716 if (ir->mode == ir_var_uniform) { 717 int param_index = c->prog_data.nr_params; 718 719 if (!strncmp(ir->name, "gl_", 3)) { 720 setup_builtin_uniform_values(ir); 721 } else { 722 setup_uniform_values(ir->location, ir->type); 723 } 724 725 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); 726 reg->type = brw_type_for_base_type(ir->type); 727 } 728 729 if (!reg) 730 reg = new(this->mem_ctx) fs_reg(this, ir->type); 731 732 hash_table_insert(this->variable_ht, reg, ir); 733} 734 735void 736fs_visitor::visit(ir_dereference_variable *ir) 737{ 738 fs_reg *reg = variable_storage(ir->var); 739 this->result 
= *reg; 740} 741 742void 743fs_visitor::visit(ir_dereference_record *ir) 744{ 745 const glsl_type *struct_type = ir->record->type; 746 747 ir->record->accept(this); 748 749 unsigned int offset = 0; 750 for (unsigned int i = 0; i < struct_type->length; i++) { 751 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) 752 break; 753 offset += type_size(struct_type->fields.structure[i].type); 754 } 755 this->result.reg_offset += offset; 756 this->result.type = brw_type_for_base_type(ir->type); 757} 758 759void 760fs_visitor::visit(ir_dereference_array *ir) 761{ 762 ir_constant *index; 763 int element_size; 764 765 ir->array->accept(this); 766 index = ir->array_index->as_constant(); 767 768 element_size = type_size(ir->type); 769 this->result.type = brw_type_for_base_type(ir->type); 770 771 if (index) { 772 assert(this->result.file == UNIFORM || 773 (this->result.file == GRF && 774 this->result.reg != 0)); 775 this->result.reg_offset += index->value.i[0] * element_size; 776 } else { 777 assert(!"FINISHME: non-constant array element"); 778 } 779} 780 781/* Instruction selection: Produce a MOV.sat instead of 782 * MIN(MAX(val, 0), 1) when possible. 
783 */ 784bool 785fs_visitor::try_emit_saturate(ir_expression *ir) 786{ 787 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 788 789 if (!sat_val) 790 return false; 791 792 sat_val->accept(this); 793 fs_reg src = this->result; 794 795 this->result = fs_reg(this, ir->type); 796 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src); 797 inst->saturate = true; 798 799 return true; 800} 801 802static uint32_t 803brw_conditional_for_comparison(unsigned int op) 804{ 805 switch (op) { 806 case ir_binop_less: 807 return BRW_CONDITIONAL_L; 808 case ir_binop_greater: 809 return BRW_CONDITIONAL_G; 810 case ir_binop_lequal: 811 return BRW_CONDITIONAL_LE; 812 case ir_binop_gequal: 813 return BRW_CONDITIONAL_GE; 814 case ir_binop_equal: 815 case ir_binop_all_equal: /* same as equal for scalars */ 816 return BRW_CONDITIONAL_Z; 817 case ir_binop_nequal: 818 case ir_binop_any_nequal: /* same as nequal for scalars */ 819 return BRW_CONDITIONAL_NZ; 820 default: 821 assert(!"not reached: bad operation for comparison"); 822 return BRW_CONDITIONAL_NZ; 823 } 824} 825 826void 827fs_visitor::visit(ir_expression *ir) 828{ 829 unsigned int operand; 830 fs_reg op[2], temp; 831 fs_inst *inst; 832 833 assert(ir->get_num_operands() <= 2); 834 835 if (try_emit_saturate(ir)) 836 return; 837 838 for (operand = 0; operand < ir->get_num_operands(); operand++) { 839 ir->operands[operand]->accept(this); 840 if (this->result.file == BAD_FILE) { 841 ir_print_visitor v; 842 fail("Failed to get tree for expression operand:\n"); 843 ir->operands[operand]->accept(&v); 844 } 845 op[operand] = this->result; 846 847 /* Matrix expression operands should have been broken down to vector 848 * operations already. 849 */ 850 assert(!ir->operands[operand]->type->is_matrix()); 851 /* And then those vector operands should have been broken down to scalar. 852 */ 853 assert(!ir->operands[operand]->type->is_vector()); 854 } 855 856 /* Storage for our result. 
If our result goes into an assignment, it will 857 * just get copy-propagated out, so no worries. 858 */ 859 this->result = fs_reg(this, ir->type); 860 861 switch (ir->operation) { 862 case ir_unop_logic_not: 863 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 864 * ones complement of the whole register, not just bit 0. 865 */ 866 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)); 867 break; 868 case ir_unop_neg: 869 op[0].negate = !op[0].negate; 870 this->result = op[0]; 871 break; 872 case ir_unop_abs: 873 op[0].abs = true; 874 op[0].negate = false; 875 this->result = op[0]; 876 break; 877 case ir_unop_sign: 878 temp = fs_reg(this, ir->type); 879 880 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)); 881 882 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 883 inst->conditional_mod = BRW_CONDITIONAL_G; 884 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)); 885 inst->predicated = true; 886 887 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 888 inst->conditional_mod = BRW_CONDITIONAL_L; 889 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)); 890 inst->predicated = true; 891 892 break; 893 case ir_unop_rcp: 894 emit_math(FS_OPCODE_RCP, this->result, op[0]); 895 break; 896 897 case ir_unop_exp2: 898 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 899 break; 900 case ir_unop_log2: 901 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 902 break; 903 case ir_unop_exp: 904 case ir_unop_log: 905 assert(!"not reached: should be handled by ir_explog_to_explog2"); 906 break; 907 case ir_unop_sin: 908 case ir_unop_sin_reduced: 909 emit_math(FS_OPCODE_SIN, this->result, op[0]); 910 break; 911 case ir_unop_cos: 912 case ir_unop_cos_reduced: 913 emit_math(FS_OPCODE_COS, this->result, op[0]); 914 break; 915 916 case ir_unop_dFdx: 917 emit(FS_OPCODE_DDX, this->result, op[0]); 918 break; 919 case ir_unop_dFdy: 920 emit(FS_OPCODE_DDY, this->result, op[0]); 921 break; 922 923 case ir_binop_add: 924 emit(BRW_OPCODE_ADD, 
this->result, op[0], op[1]); 925 break; 926 case ir_binop_sub: 927 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 928 break; 929 930 case ir_binop_mul: 931 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]); 932 break; 933 case ir_binop_div: 934 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 935 break; 936 case ir_binop_mod: 937 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 938 break; 939 940 case ir_binop_less: 941 case ir_binop_greater: 942 case ir_binop_lequal: 943 case ir_binop_gequal: 944 case ir_binop_equal: 945 case ir_binop_all_equal: 946 case ir_binop_nequal: 947 case ir_binop_any_nequal: 948 temp = this->result; 949 /* original gen4 does implicit conversion before comparison. */ 950 if (intel->gen < 5) 951 temp.type = op[0].type; 952 953 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); 954 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 955 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)); 956 break; 957 958 case ir_binop_logic_xor: 959 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 960 break; 961 962 case ir_binop_logic_or: 963 emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 964 break; 965 966 case ir_binop_logic_and: 967 emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 968 break; 969 970 case ir_binop_dot: 971 case ir_unop_any: 972 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 973 break; 974 975 case ir_unop_noise: 976 assert(!"not reached: should be handled by lower_noise"); 977 break; 978 979 case ir_quadop_vector: 980 assert(!"not reached: should be handled by lower_quadop_vector"); 981 break; 982 983 case ir_unop_sqrt: 984 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 985 break; 986 987 case ir_unop_rsq: 988 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 989 break; 990 991 case ir_unop_i2f: 992 case ir_unop_b2f: 993 case ir_unop_b2i: 994 case ir_unop_f2i: 995 emit(BRW_OPCODE_MOV, this->result, op[0]); 996 break; 997 case 
ir_unop_f2b: 998 case ir_unop_i2b: 999 temp = this->result; 1000 /* original gen4 does implicit conversion before comparison. */ 1001 if (intel->gen < 5) 1002 temp.type = op[0].type; 1003 1004 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)); 1005 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1006 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1)); 1007 break; 1008 1009 case ir_unop_trunc: 1010 emit(BRW_OPCODE_RNDZ, this->result, op[0]); 1011 break; 1012 case ir_unop_ceil: 1013 op[0].negate = !op[0].negate; 1014 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1015 this->result.negate = true; 1016 break; 1017 case ir_unop_floor: 1018 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1019 break; 1020 case ir_unop_fract: 1021 inst = emit(BRW_OPCODE_FRC, this->result, op[0]); 1022 break; 1023 case ir_unop_round_even: 1024 emit(BRW_OPCODE_RNDE, this->result, op[0]); 1025 break; 1026 1027 case ir_binop_min: 1028 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1029 inst->conditional_mod = BRW_CONDITIONAL_L; 1030 1031 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1032 inst->predicated = true; 1033 break; 1034 case ir_binop_max: 1035 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1036 inst->conditional_mod = BRW_CONDITIONAL_G; 1037 1038 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1039 inst->predicated = true; 1040 break; 1041 1042 case ir_binop_pow: 1043 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1044 break; 1045 1046 case ir_unop_bit_not: 1047 inst = emit(BRW_OPCODE_NOT, this->result, op[0]); 1048 break; 1049 case ir_binop_bit_and: 1050 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 1051 break; 1052 case ir_binop_bit_xor: 1053 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 1054 break; 1055 case ir_binop_bit_or: 1056 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 1057 break; 1058 1059 case ir_unop_u2f: 1060 case ir_binop_lshift: 1061 case ir_binop_rshift: 1062 
assert(!"GLSL 1.30 features unsupported"); 1063 break; 1064 } 1065} 1066 1067void 1068fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1069 const glsl_type *type, bool predicated) 1070{ 1071 switch (type->base_type) { 1072 case GLSL_TYPE_FLOAT: 1073 case GLSL_TYPE_UINT: 1074 case GLSL_TYPE_INT: 1075 case GLSL_TYPE_BOOL: 1076 for (unsigned int i = 0; i < type->components(); i++) { 1077 l.type = brw_type_for_base_type(type); 1078 r.type = brw_type_for_base_type(type); 1079 1080 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r); 1081 inst->predicated = predicated; 1082 1083 l.reg_offset++; 1084 r.reg_offset++; 1085 } 1086 break; 1087 case GLSL_TYPE_ARRAY: 1088 for (unsigned int i = 0; i < type->length; i++) { 1089 emit_assignment_writes(l, r, type->fields.array, predicated); 1090 } 1091 break; 1092 1093 case GLSL_TYPE_STRUCT: 1094 for (unsigned int i = 0; i < type->length; i++) { 1095 emit_assignment_writes(l, r, type->fields.structure[i].type, 1096 predicated); 1097 } 1098 break; 1099 1100 case GLSL_TYPE_SAMPLER: 1101 break; 1102 1103 default: 1104 assert(!"not reached"); 1105 break; 1106 } 1107} 1108 1109void 1110fs_visitor::visit(ir_assignment *ir) 1111{ 1112 struct fs_reg l, r; 1113 fs_inst *inst; 1114 1115 /* FINISHME: arrays on the lhs */ 1116 ir->lhs->accept(this); 1117 l = this->result; 1118 1119 ir->rhs->accept(this); 1120 r = this->result; 1121 1122 assert(l.file != BAD_FILE); 1123 assert(r.file != BAD_FILE); 1124 1125 if (ir->condition) { 1126 emit_bool_to_cond_code(ir->condition); 1127 } 1128 1129 if (ir->lhs->type->is_scalar() || 1130 ir->lhs->type->is_vector()) { 1131 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1132 if (ir->write_mask & (1 << i)) { 1133 inst = emit(BRW_OPCODE_MOV, l, r); 1134 if (ir->condition) 1135 inst->predicated = true; 1136 r.reg_offset++; 1137 } 1138 l.reg_offset++; 1139 } 1140 } else { 1141 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1142 } 1143} 1144 1145fs_inst * 
1146fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1147{ 1148 int mlen; 1149 int base_mrf = 1; 1150 bool simd16 = false; 1151 fs_reg orig_dst; 1152 1153 /* g0 header. */ 1154 mlen = 1; 1155 1156 if (ir->shadow_comparitor) { 1157 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1158 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1159 coordinate.reg_offset++; 1160 } 1161 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1162 mlen += 3; 1163 1164 if (ir->op == ir_tex) { 1165 /* There's no plain shadow compare message, so we use shadow 1166 * compare with a bias of 0.0. 1167 */ 1168 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); 1169 mlen++; 1170 } else if (ir->op == ir_txb) { 1171 ir->lod_info.bias->accept(this); 1172 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1173 mlen++; 1174 } else { 1175 assert(ir->op == ir_txl); 1176 ir->lod_info.lod->accept(this); 1177 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1178 mlen++; 1179 } 1180 1181 ir->shadow_comparitor->accept(this); 1182 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1183 mlen++; 1184 } else if (ir->op == ir_tex) { 1185 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1186 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1187 coordinate.reg_offset++; 1188 } 1189 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1190 mlen += 3; 1191 } else if (ir->op == ir_txd) { 1192 assert(!"TXD isn't supported on gen4 yet."); 1193 } else { 1194 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1195 * instructions. We'll need to do SIMD16 here. 
1196 */ 1197 assert(ir->op == ir_txb || ir->op == ir_txl); 1198 1199 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1200 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate); 1201 coordinate.reg_offset++; 1202 } 1203 1204 /* lod/bias appears after u/v/r. */ 1205 mlen += 6; 1206 1207 if (ir->op == ir_txb) { 1208 ir->lod_info.bias->accept(this); 1209 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1210 mlen++; 1211 } else { 1212 ir->lod_info.lod->accept(this); 1213 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1214 mlen++; 1215 } 1216 1217 /* The unused upper half. */ 1218 mlen++; 1219 1220 /* Now, since we're doing simd16, the return is 2 interleaved 1221 * vec4s where the odd-indexed ones are junk. We'll need to move 1222 * this weirdness around to the expected layout. 1223 */ 1224 simd16 = true; 1225 orig_dst = dst; 1226 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1227 2)); 1228 dst.type = BRW_REGISTER_TYPE_F; 1229 } 1230 1231 fs_inst *inst = NULL; 1232 switch (ir->op) { 1233 case ir_tex: 1234 inst = emit(FS_OPCODE_TEX, dst); 1235 break; 1236 case ir_txb: 1237 inst = emit(FS_OPCODE_TXB, dst); 1238 break; 1239 case ir_txl: 1240 inst = emit(FS_OPCODE_TXL, dst); 1241 break; 1242 case ir_txd: 1243 inst = emit(FS_OPCODE_TXD, dst); 1244 break; 1245 case ir_txf: 1246 assert(!"GLSL 1.30 features unsupported"); 1247 break; 1248 } 1249 inst->base_mrf = base_mrf; 1250 inst->mlen = mlen; 1251 1252 if (simd16) { 1253 for (int i = 0; i < 4; i++) { 1254 emit(BRW_OPCODE_MOV, orig_dst, dst); 1255 orig_dst.reg_offset++; 1256 dst.reg_offset += 2; 1257 } 1258 } 1259 1260 return inst; 1261} 1262 1263fs_inst * 1264fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1265{ 1266 /* gen5's SIMD8 sampler has slots for u, v, r, array index, then 1267 * optional parameters like shadow comparitor or LOD bias. 
If 1268 * optional parameters aren't present, those base slots are 1269 * optional and don't need to be included in the message. 1270 * 1271 * We don't fill in the unnecessary slots regardless, which may 1272 * look surprising in the disassembly. 1273 */ 1274 int mlen = 1; /* g0 header always present. */ 1275 int base_mrf = 1; 1276 1277 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1278 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1279 coordinate.reg_offset++; 1280 } 1281 mlen += ir->coordinate->type->vector_elements; 1282 1283 if (ir->shadow_comparitor) { 1284 mlen = MAX2(mlen, 5); 1285 1286 ir->shadow_comparitor->accept(this); 1287 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1288 mlen++; 1289 } 1290 1291 fs_inst *inst = NULL; 1292 switch (ir->op) { 1293 case ir_tex: 1294 inst = emit(FS_OPCODE_TEX, dst); 1295 break; 1296 case ir_txb: 1297 ir->lod_info.bias->accept(this); 1298 mlen = MAX2(mlen, 5); 1299 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1300 mlen++; 1301 1302 inst = emit(FS_OPCODE_TXB, dst); 1303 break; 1304 case ir_txl: 1305 ir->lod_info.lod->accept(this); 1306 mlen = MAX2(mlen, 5); 1307 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1308 mlen++; 1309 1310 inst = emit(FS_OPCODE_TXL, dst); 1311 break; 1312 case ir_txd: 1313 case ir_txf: 1314 assert(!"GLSL 1.30 features unsupported"); 1315 break; 1316 } 1317 inst->base_mrf = base_mrf; 1318 inst->mlen = mlen; 1319 1320 return inst; 1321} 1322 1323void 1324fs_visitor::visit(ir_texture *ir) 1325{ 1326 int sampler; 1327 fs_inst *inst = NULL; 1328 1329 ir->coordinate->accept(this); 1330 fs_reg coordinate = this->result; 1331 1332 if (ir->offset != NULL) { 1333 ir_constant *offset = ir->offset->as_constant(); 1334 assert(offset != NULL); 1335 1336 signed char offsets[3]; 1337 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) 1338 offsets[i] = (signed char) offset->value.i[i]; 1339 1340 /* 
Combine all three offsets into a single unsigned dword: 1341 * 1342 * bits 11:8 - U Offset (X component) 1343 * bits 7:4 - V Offset (Y component) 1344 * bits 3:0 - R Offset (Z component) 1345 */ 1346 unsigned offset_bits = 0; 1347 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { 1348 const unsigned shift = 4 * (2 - i); 1349 offset_bits |= (offsets[i] << shift) & (0xF << shift); 1350 } 1351 1352 /* Explicitly set up the message header by copying g0 to msg reg m1. */ 1353 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), 1354 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD)); 1355 1356 /* Then set the offset bits in DWord 2 of the message header. */ 1357 emit(BRW_OPCODE_MOV, 1358 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), 1359 BRW_REGISTER_TYPE_UD)), 1360 fs_reg(brw_imm_uw(offset_bits))); 1361 } 1362 1363 /* Should be lowered by do_lower_texture_projection */ 1364 assert(!ir->projector); 1365 1366 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1367 ctx->Shader.CurrentFragmentProgram, 1368 &brw->fragment_program->Base); 1369 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1370 1371 /* The 965 requires the EU to do the normalization of GL rectangle 1372 * texture coordinates. We use the program parameter state 1373 * tracking to get the scaling factor. 
1374 */ 1375 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1376 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1377 int tokens[STATE_LENGTH] = { 1378 STATE_INTERNAL, 1379 STATE_TEXRECT_SCALE, 1380 sampler, 1381 0, 1382 0 1383 }; 1384 1385 c->prog_data.param_convert[c->prog_data.nr_params] = 1386 PARAM_NO_CONVERT; 1387 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1388 PARAM_NO_CONVERT; 1389 1390 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1391 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1392 GLuint index = _mesa_add_state_reference(params, 1393 (gl_state_index *)tokens); 1394 1395 this->param_index[c->prog_data.nr_params] = index; 1396 this->param_offset[c->prog_data.nr_params] = 0; 1397 c->prog_data.nr_params++; 1398 this->param_index[c->prog_data.nr_params] = index; 1399 this->param_offset[c->prog_data.nr_params] = 1; 1400 c->prog_data.nr_params++; 1401 1402 fs_reg dst = fs_reg(this, ir->coordinate->type); 1403 fs_reg src = coordinate; 1404 coordinate = dst; 1405 1406 emit(BRW_OPCODE_MUL, dst, src, scale_x); 1407 dst.reg_offset++; 1408 src.reg_offset++; 1409 emit(BRW_OPCODE_MUL, dst, src, scale_y); 1410 } 1411 1412 /* Writemasking doesn't eliminate channels on SIMD8 texture 1413 * samples, so don't worry about them. 1414 */ 1415 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1416 1417 if (intel->gen < 5) { 1418 inst = emit_texture_gen4(ir, dst, coordinate); 1419 } else { 1420 inst = emit_texture_gen5(ir, dst, coordinate); 1421 } 1422 1423 /* If there's an offset, we already set up m1. To avoid the implied move, 1424 * use the null register. Otherwise, we want an implied move from g0. 
1425 */ 1426 if (ir->offset != NULL) 1427 inst->src[0] = fs_reg(brw_null_reg()); 1428 else 1429 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); 1430 1431 inst->sampler = sampler; 1432 1433 this->result = dst; 1434 1435 if (ir->shadow_comparitor) 1436 inst->shadow_compare = true; 1437 1438 if (ir->type == glsl_type::float_type) { 1439 /* Ignore DEPTH_TEXTURE_MODE swizzling. */ 1440 assert(ir->sampler->type->sampler_shadow); 1441 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1442 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1443 1444 for (int i = 0; i < 4; i++) { 1445 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1446 fs_reg l = swizzle_dst; 1447 l.reg_offset += i; 1448 1449 if (swiz == SWIZZLE_ZERO) { 1450 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f)); 1451 } else if (swiz == SWIZZLE_ONE) { 1452 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f)); 1453 } else { 1454 fs_reg r = dst; 1455 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1456 emit(BRW_OPCODE_MOV, l, r); 1457 } 1458 } 1459 this->result = swizzle_dst; 1460 } 1461} 1462 1463void 1464fs_visitor::visit(ir_swizzle *ir) 1465{ 1466 ir->val->accept(this); 1467 fs_reg val = this->result; 1468 1469 if (ir->type->vector_elements == 1) { 1470 this->result.reg_offset += ir->mask.x; 1471 return; 1472 } 1473 1474 fs_reg result = fs_reg(this, ir->type); 1475 this->result = result; 1476 1477 for (unsigned int i = 0; i < ir->type->vector_elements; i++) { 1478 fs_reg channel = val; 1479 int swiz = 0; 1480 1481 switch (i) { 1482 case 0: 1483 swiz = ir->mask.x; 1484 break; 1485 case 1: 1486 swiz = ir->mask.y; 1487 break; 1488 case 2: 1489 swiz = ir->mask.z; 1490 break; 1491 case 3: 1492 swiz = ir->mask.w; 1493 break; 1494 } 1495 1496 channel.reg_offset += swiz; 1497 emit(BRW_OPCODE_MOV, result, channel); 1498 result.reg_offset++; 1499 } 1500} 1501 1502void 1503fs_visitor::visit(ir_discard *ir) 1504{ 1505 fs_reg temp = fs_reg(this, glsl_type::uint_type); 1506 1507 
assert(ir->condition == NULL); /* FINISHME */ 1508 1509 emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d); 1510 emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp); 1511 kill_emitted = true; 1512} 1513 1514void 1515fs_visitor::visit(ir_constant *ir) 1516{ 1517 /* Set this->result to reg at the bottom of the function because some code 1518 * paths will cause this visitor to be applied to other fields. This will 1519 * cause the value stored in this->result to be modified. 1520 * 1521 * Make reg constant so that it doesn't get accidentally modified along the 1522 * way. Yes, I actually had this problem. :( 1523 */ 1524 const fs_reg reg(this, ir->type); 1525 fs_reg dst_reg = reg; 1526 1527 if (ir->type->is_array()) { 1528 const unsigned size = type_size(ir->type->fields.array); 1529 1530 for (unsigned i = 0; i < ir->type->length; i++) { 1531 ir->array_elements[i]->accept(this); 1532 fs_reg src_reg = this->result; 1533 1534 dst_reg.type = src_reg.type; 1535 for (unsigned j = 0; j < size; j++) { 1536 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1537 src_reg.reg_offset++; 1538 dst_reg.reg_offset++; 1539 } 1540 } 1541 } else if (ir->type->is_record()) { 1542 foreach_list(node, &ir->components) { 1543 ir_instruction *const field = (ir_instruction *) node; 1544 const unsigned size = type_size(field->type); 1545 1546 field->accept(this); 1547 fs_reg src_reg = this->result; 1548 1549 dst_reg.type = src_reg.type; 1550 for (unsigned j = 0; j < size; j++) { 1551 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1552 src_reg.reg_offset++; 1553 dst_reg.reg_offset++; 1554 } 1555 } 1556 } else { 1557 const unsigned size = type_size(ir->type); 1558 1559 for (unsigned i = 0; i < size; i++) { 1560 switch (ir->type->base_type) { 1561 case GLSL_TYPE_FLOAT: 1562 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])); 1563 break; 1564 case GLSL_TYPE_UINT: 1565 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])); 1566 break; 1567 case GLSL_TYPE_INT: 1568 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])); 1569 
break; 1570 case GLSL_TYPE_BOOL: 1571 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])); 1572 break; 1573 default: 1574 assert(!"Non-float/uint/int/bool constant"); 1575 } 1576 dst_reg.reg_offset++; 1577 } 1578 } 1579 1580 this->result = reg; 1581} 1582 1583void 1584fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1585{ 1586 ir_expression *expr = ir->as_expression(); 1587 1588 if (expr) { 1589 fs_reg op[2]; 1590 fs_inst *inst; 1591 1592 assert(expr->get_num_operands() <= 2); 1593 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1594 assert(expr->operands[i]->type->is_scalar()); 1595 1596 expr->operands[i]->accept(this); 1597 op[i] = this->result; 1598 } 1599 1600 switch (expr->operation) { 1601 case ir_unop_logic_not: 1602 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)); 1603 inst->conditional_mod = BRW_CONDITIONAL_Z; 1604 break; 1605 1606 case ir_binop_logic_xor: 1607 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]); 1608 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1609 break; 1610 1611 case ir_binop_logic_or: 1612 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]); 1613 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1614 break; 1615 1616 case ir_binop_logic_and: 1617 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]); 1618 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1619 break; 1620 1621 case ir_unop_f2b: 1622 if (intel->gen >= 6) { 1623 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f)); 1624 } else { 1625 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]); 1626 } 1627 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1628 break; 1629 1630 case ir_unop_i2b: 1631 if (intel->gen >= 6) { 1632 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)); 1633 } else { 1634 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]); 1635 } 1636 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1637 break; 1638 1639 case ir_binop_greater: 1640 case ir_binop_gequal: 1641 case ir_binop_less: 1642 case ir_binop_lequal: 1643 case ir_binop_equal: 1644 
case ir_binop_all_equal: 1645 case ir_binop_nequal: 1646 case ir_binop_any_nequal: 1647 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]); 1648 inst->conditional_mod = 1649 brw_conditional_for_comparison(expr->operation); 1650 break; 1651 1652 default: 1653 assert(!"not reached"); 1654 fail("bad cond code\n"); 1655 break; 1656 } 1657 return; 1658 } 1659 1660 ir->accept(this); 1661 1662 if (intel->gen >= 6) { 1663 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1)); 1664 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1665 } else { 1666 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result); 1667 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1668 } 1669} 1670 1671/** 1672 * Emit a gen6 IF statement with the comparison folded into the IF 1673 * instruction. 1674 */ 1675void 1676fs_visitor::emit_if_gen6(ir_if *ir) 1677{ 1678 ir_expression *expr = ir->condition->as_expression(); 1679 1680 if (expr) { 1681 fs_reg op[2]; 1682 fs_inst *inst; 1683 fs_reg temp; 1684 1685 assert(expr->get_num_operands() <= 2); 1686 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1687 assert(expr->operands[i]->type->is_scalar()); 1688 1689 expr->operands[i]->accept(this); 1690 op[i] = this->result; 1691 } 1692 1693 switch (expr->operation) { 1694 case ir_unop_logic_not: 1695 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0)); 1696 inst->conditional_mod = BRW_CONDITIONAL_Z; 1697 return; 1698 1699 case ir_binop_logic_xor: 1700 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1701 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1702 return; 1703 1704 case ir_binop_logic_or: 1705 temp = fs_reg(this, glsl_type::bool_type); 1706 emit(BRW_OPCODE_OR, temp, op[0], op[1]); 1707 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1708 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1709 return; 1710 1711 case ir_binop_logic_and: 1712 temp = fs_reg(this, glsl_type::bool_type); 1713 emit(BRW_OPCODE_AND, temp, op[0], op[1]); 1714 inst = emit(BRW_OPCODE_IF, 
reg_null_d, temp, fs_reg(0)); 1715 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1716 return; 1717 1718 case ir_unop_f2b: 1719 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)); 1720 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1721 return; 1722 1723 case ir_unop_i2b: 1724 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1725 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1726 return; 1727 1728 case ir_binop_greater: 1729 case ir_binop_gequal: 1730 case ir_binop_less: 1731 case ir_binop_lequal: 1732 case ir_binop_equal: 1733 case ir_binop_all_equal: 1734 case ir_binop_nequal: 1735 case ir_binop_any_nequal: 1736 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1737 inst->conditional_mod = 1738 brw_conditional_for_comparison(expr->operation); 1739 return; 1740 default: 1741 assert(!"not reached"); 1742 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1743 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1744 fail("bad condition\n"); 1745 return; 1746 } 1747 return; 1748 } 1749 1750 ir->condition->accept(this); 1751 1752 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)); 1753 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1754} 1755 1756void 1757fs_visitor::visit(ir_if *ir) 1758{ 1759 fs_inst *inst; 1760 1761 /* Don't point the annotation at the if statement, because then it plus 1762 * the then and else blocks get printed. 
1763 */ 1764 this->base_ir = ir->condition; 1765 1766 if (intel->gen >= 6) { 1767 emit_if_gen6(ir); 1768 } else { 1769 emit_bool_to_cond_code(ir->condition); 1770 1771 inst = emit(BRW_OPCODE_IF); 1772 inst->predicated = true; 1773 } 1774 1775 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1776 ir_instruction *ir = (ir_instruction *)iter.get(); 1777 this->base_ir = ir; 1778 1779 ir->accept(this); 1780 } 1781 1782 if (!ir->else_instructions.is_empty()) { 1783 emit(BRW_OPCODE_ELSE); 1784 1785 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1786 ir_instruction *ir = (ir_instruction *)iter.get(); 1787 this->base_ir = ir; 1788 1789 ir->accept(this); 1790 } 1791 } 1792 1793 emit(BRW_OPCODE_ENDIF); 1794} 1795 1796void 1797fs_visitor::visit(ir_loop *ir) 1798{ 1799 fs_reg counter = reg_undef; 1800 1801 if (ir->counter) { 1802 this->base_ir = ir->counter; 1803 ir->counter->accept(this); 1804 counter = *(variable_storage(ir->counter)); 1805 1806 if (ir->from) { 1807 this->base_ir = ir->from; 1808 ir->from->accept(this); 1809 1810 emit(BRW_OPCODE_MOV, counter, this->result); 1811 } 1812 } 1813 1814 emit(BRW_OPCODE_DO); 1815 1816 if (ir->to) { 1817 this->base_ir = ir->to; 1818 ir->to->accept(this); 1819 1820 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result); 1821 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); 1822 1823 inst = emit(BRW_OPCODE_BREAK); 1824 inst->predicated = true; 1825 } 1826 1827 foreach_iter(exec_list_iterator, iter, ir->body_instructions) { 1828 ir_instruction *ir = (ir_instruction *)iter.get(); 1829 1830 this->base_ir = ir; 1831 ir->accept(this); 1832 } 1833 1834 if (ir->increment) { 1835 this->base_ir = ir->increment; 1836 ir->increment->accept(this); 1837 emit(BRW_OPCODE_ADD, counter, counter, this->result); 1838 } 1839 1840 emit(BRW_OPCODE_WHILE); 1841} 1842 1843void 1844fs_visitor::visit(ir_loop_jump *ir) 1845{ 1846 switch (ir->mode) { 1847 case ir_loop_jump::jump_break: 1848 
emit(BRW_OPCODE_BREAK); 1849 break; 1850 case ir_loop_jump::jump_continue: 1851 emit(BRW_OPCODE_CONTINUE); 1852 break; 1853 } 1854} 1855 1856void 1857fs_visitor::visit(ir_call *ir) 1858{ 1859 assert(!"FINISHME"); 1860} 1861 1862void 1863fs_visitor::visit(ir_return *ir) 1864{ 1865 assert(!"FINISHME"); 1866} 1867 1868void 1869fs_visitor::visit(ir_function *ir) 1870{ 1871 /* Ignore function bodies other than main() -- we shouldn't see calls to 1872 * them since they should all be inlined before we get to ir_to_mesa. 1873 */ 1874 if (strcmp(ir->name, "main") == 0) { 1875 const ir_function_signature *sig; 1876 exec_list empty; 1877 1878 sig = ir->matching_signature(&empty); 1879 1880 assert(sig); 1881 1882 foreach_iter(exec_list_iterator, iter, sig->body) { 1883 ir_instruction *ir = (ir_instruction *)iter.get(); 1884 this->base_ir = ir; 1885 1886 ir->accept(this); 1887 } 1888 } 1889} 1890 1891void 1892fs_visitor::visit(ir_function_signature *ir) 1893{ 1894 assert(!"not reached"); 1895 (void)ir; 1896} 1897 1898fs_inst * 1899fs_visitor::emit(fs_inst inst) 1900{ 1901 fs_inst *list_inst = new(mem_ctx) fs_inst; 1902 *list_inst = inst; 1903 1904 list_inst->annotation = this->current_annotation; 1905 list_inst->ir = this->base_ir; 1906 1907 this->instructions.push_tail(list_inst); 1908 1909 return list_inst; 1910} 1911 1912/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 1913void 1914fs_visitor::emit_dummy_fs() 1915{ 1916 /* Everyone's favorite color. */ 1917 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f)); 1918 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f)); 1919 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f)); 1920 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f)); 1921 1922 fs_inst *write; 1923 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); 1924 write->base_mrf = 0; 1925} 1926 1927/* The register location here is relative to the start of the URB 1928 * data. 
It will get adjusted to be a real location before
 * generate_code() time.
 *
 * The register number is urb_setup[location] * 2 + channel / 2, so two
 * channels share each register; odd channels pass 4 and even channels 0
 * as the second argument of brw_vec1_grf() (presumably the sub-register
 * index -- confirm against brw_eu.h).
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
   int regnr = urb_setup[location] * 2 + channel / 2;
   /* Note: this local hides the stride() helper used elsewhere in the file. */
   int stride = (channel & 1) * 4;

   /* The attribute must have been assigned a setup slot. */
   assert(urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}

/**
 * Emits the interpolation for the varying inputs.
 *
 * Sets up pixel_x/pixel_y, delta_x/delta_y, wpos_w and pixel_w for later
 * use, annotating each group of instructions along the way.
 */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   /* X comes from g1 UW element 4 onwards, Y from element 5 onwards; the
    * vector immediates presumably hold the per-pixel offsets within each
    * subspan -- TODO confirm against the payload layout.
    */
   emit(BRW_OPCODE_ADD,
        this->pixel_x,
        fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
        fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
        this->pixel_y,
        fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
        fs_reg(brw_imm_v(0x11001100)));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      /* With PLN available, delta_x and delta_y are allocated as the two
       * consecutive components of one vec2.
       */
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   /* delta = pixel position minus the value held in g1.0 / g1.1. */
   emit(BRW_OPCODE_ADD, this->delta_x,
        this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
   emit(BRW_OPCODE_ADD, this->delta_y,
        this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
        interp_reg(FRAG_ATTRIB_WPOS, 3));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}

/**
 * Emits the interpolation for the varying inputs.
 *
 * gen6 variant: pixel centers are computed as integers and converted to
 * float, wpos_w is read from the register named by c->source_w_reg, and
 * delta_x/delta_y are taken directly from g2 and g3.
 */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(BRW_OPCODE_ADD,
        int_pixel_x,
        fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
        fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
        int_pixel_y,
        fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
        fs_reg(brw_imm_v(0x11001100)));

   /* As of gen6, we can no longer mix float and int sources.  We have
    * to turn the integer pixel centers into floats for their actual
    * use.
    */
   this->pixel_x = fs_reg(this, glsl_type::float_type);
   this->pixel_y = fs_reg(this, glsl_type::float_type);
   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);

   this->current_annotation = "compute 1/pos.w";
   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);

   /* The deltas are read straight from fixed registers g2 and g3. */
   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
   this->delta_y = fs_reg(brw_vec8_grf(3, 0));

   this->current_annotation = NULL;
}

/**
 * Emits the framebuffer write(s) that end the shader.
 *
 * Message registers are laid out from m0 in this order: optional two
 * register header, optional AA/stencil register, four color registers,
 * optional source depth, optional destination depth.  nr tracks the next
 * free message register and ends up as the message length.  One FB write
 * is emitted per color region (the last one carrying EOT), or a single
 * EOT write when there are no color regions.
 */
void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
   GLboolean header_present = GL_TRUE;
   int nr = 0;

   /* The header is dropped only on gen6+ when no discard was emitted and
    * exactly one color region is bound.
    */
   if (intel->gen >= 6 &&
       !this->kill_emitted &&
       c->key.nr_color_regions == 1) {
      header_present = false;
   }

   if (header_present) {
      /* m0, m1 header */
      nr += 2;
   }

   if (c->aa_dest_stencil_reg) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
           fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
   }

   /* Reserve space for color. It'll be filled in per MRT below. */
   int color_mrf = nr;
   nr += 4;

   if (c->source_depth_to_render_target) {
      if (c->computes_depth) {
         /* Hand over gl_FragDepth. */
         assert(this->frag_depth);
         fs_reg depth = *(variable_storage(this->frag_depth));

         emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
      } else {
         /* Pass through the payload depth. */
         emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
              fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
      }
   }

   if (c->dest_depth_reg) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
           fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
   }

   /* Pick the color source: frag_color if present, otherwise frag_data
    * (forced to float type).  It stays reg_undef when neither exists.
    */
   fs_reg color = reg_undef;
   if (this->frag_color)
      color = *(variable_storage(this->frag_color));
   else if (this->frag_data) {
      color = *(variable_storage(this->frag_data));
      color.type = BRW_REGISTER_TYPE_F;
   }

   for (int target = 0; target < c->key.nr_color_regions; target++) {
      this->current_annotation = ralloc_asprintf(this->mem_ctx,
                                                 "FB write target %d",
                                                 target);
      if (this->frag_color || this->frag_data) {
         for (int i = 0; i < 4; i++) {
            emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
            color.reg_offset++;
         }
      }

      /* frag_color is rewound so every target gets the same four
       * registers; frag_data keeps advancing to the next four.
       */
      if (this->frag_color)
         color.reg_offset -= 4;

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->target = target;
      inst->base_mrf = 0;
      inst->mlen = nr;
      /* Only the final write terminates the thread. */
      if (target == c->key.nr_color_regions - 1)
         inst->eot = true;
      inst->header_present = header_present;
   }

   if (c->key.nr_color_regions == 0) {
      if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
         /* If the alpha test is enabled but there's no color buffer,
          * we still need to send alpha out the pipeline to our null
          * renderbuffer.
          */
         color.reg_offset += 3;
         emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
      }

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->base_mrf = 0;
      inst->mlen = nr;
      inst->eot = true;
      inst->header_present = header_present;
   }

   this->current_annotation = NULL;
}

void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   GLboolean eot = inst->eot;
   struct brw_reg implied_header;

   /* Header is 2 regs, g0 and g1 are the contents.
g0 will be implied 2134 * move, here's g1. 2135 */ 2136 brw_push_insn_state(p); 2137 brw_set_mask_control(p, BRW_MASK_DISABLE); 2138 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2139 2140 if (inst->header_present) { 2141 if (intel->gen >= 6) { 2142 brw_MOV(p, 2143 brw_message_reg(inst->base_mrf), 2144 brw_vec8_grf(0, 0)); 2145 2146 if (inst->target > 0) { 2147 /* Set the render target index for choosing BLEND_STATE. */ 2148 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2149 BRW_REGISTER_TYPE_UD), 2150 brw_imm_ud(inst->target)); 2151 } 2152 2153 /* Clear viewport index, render target array index. */ 2154 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0), 2155 BRW_REGISTER_TYPE_UD), 2156 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2157 brw_imm_ud(0xf7ff)); 2158 2159 implied_header = brw_null_reg(); 2160 } else { 2161 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2162 } 2163 2164 brw_MOV(p, 2165 brw_message_reg(inst->base_mrf + 1), 2166 brw_vec8_grf(1, 0)); 2167 } else { 2168 implied_header = brw_null_reg(); 2169 } 2170 2171 brw_pop_insn_state(p); 2172 2173 brw_fb_WRITE(p, 2174 8, /* dispatch_width */ 2175 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), 2176 inst->base_mrf, 2177 implied_header, 2178 inst->target, 2179 inst->mlen, 2180 0, 2181 eot, 2182 inst->header_present); 2183} 2184 2185void 2186fs_visitor::generate_linterp(fs_inst *inst, 2187 struct brw_reg dst, struct brw_reg *src) 2188{ 2189 struct brw_reg delta_x = src[0]; 2190 struct brw_reg delta_y = src[1]; 2191 struct brw_reg interp = src[2]; 2192 2193 if (brw->has_pln && 2194 delta_y.nr == delta_x.nr + 1 && 2195 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2196 brw_PLN(p, dst, interp, delta_x); 2197 } else { 2198 brw_LINE(p, brw_null_reg(), interp, delta_x); 2199 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2200 } 2201} 2202 2203void 2204fs_visitor::generate_math(fs_inst *inst, 2205 struct brw_reg dst, struct brw_reg *src) 2206{ 
2207 int op; 2208 2209 switch (inst->opcode) { 2210 case FS_OPCODE_RCP: 2211 op = BRW_MATH_FUNCTION_INV; 2212 break; 2213 case FS_OPCODE_RSQ: 2214 op = BRW_MATH_FUNCTION_RSQ; 2215 break; 2216 case FS_OPCODE_SQRT: 2217 op = BRW_MATH_FUNCTION_SQRT; 2218 break; 2219 case FS_OPCODE_EXP2: 2220 op = BRW_MATH_FUNCTION_EXP; 2221 break; 2222 case FS_OPCODE_LOG2: 2223 op = BRW_MATH_FUNCTION_LOG; 2224 break; 2225 case FS_OPCODE_POW: 2226 op = BRW_MATH_FUNCTION_POW; 2227 break; 2228 case FS_OPCODE_SIN: 2229 op = BRW_MATH_FUNCTION_SIN; 2230 break; 2231 case FS_OPCODE_COS: 2232 op = BRW_MATH_FUNCTION_COS; 2233 break; 2234 default: 2235 assert(!"not reached: unknown math function"); 2236 op = 0; 2237 break; 2238 } 2239 2240 if (intel->gen >= 6) { 2241 assert(inst->mlen == 0); 2242 2243 if (inst->opcode == FS_OPCODE_POW) { 2244 brw_math2(p, dst, op, src[0], src[1]); 2245 } else { 2246 brw_math(p, dst, 2247 op, 2248 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2249 BRW_MATH_SATURATE_NONE, 2250 0, src[0], 2251 BRW_MATH_DATA_VECTOR, 2252 BRW_MATH_PRECISION_FULL); 2253 } 2254 } else { 2255 assert(inst->mlen >= 1); 2256 2257 brw_math(p, dst, 2258 op, 2259 inst->saturate ? 
BRW_MATH_SATURATE_SATURATE : 2260 BRW_MATH_SATURATE_NONE, 2261 inst->base_mrf, src[0], 2262 BRW_MATH_DATA_VECTOR, 2263 BRW_MATH_PRECISION_FULL); 2264 } 2265} 2266 2267void 2268fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2269{ 2270 int msg_type = -1; 2271 int rlen = 4; 2272 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2273 2274 if (intel->gen >= 5) { 2275 switch (inst->opcode) { 2276 case FS_OPCODE_TEX: 2277 if (inst->shadow_compare) { 2278 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 2279 } else { 2280 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 2281 } 2282 break; 2283 case FS_OPCODE_TXB: 2284 if (inst->shadow_compare) { 2285 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 2286 } else { 2287 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 2288 } 2289 break; 2290 case FS_OPCODE_TXL: 2291 if (inst->shadow_compare) { 2292 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 2293 } else { 2294 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 2295 } 2296 break; 2297 case FS_OPCODE_TXD: 2298 assert(!"TXD isn't supported on gen5+ yet."); 2299 break; 2300 } 2301 } else { 2302 switch (inst->opcode) { 2303 case FS_OPCODE_TEX: 2304 /* Note that G45 and older determines shadow compare and dispatch width 2305 * from message length for most messages. 
2306 */ 2307 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2308 if (inst->shadow_compare) { 2309 assert(inst->mlen == 6); 2310 } else { 2311 assert(inst->mlen <= 4); 2312 } 2313 break; 2314 case FS_OPCODE_TXB: 2315 if (inst->shadow_compare) { 2316 assert(inst->mlen == 6); 2317 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 2318 } else { 2319 assert(inst->mlen == 9); 2320 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2321 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2322 } 2323 break; 2324 case FS_OPCODE_TXL: 2325 if (inst->shadow_compare) { 2326 assert(inst->mlen == 6); 2327 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 2328 } else { 2329 assert(inst->mlen == 9); 2330 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 2331 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2332 } 2333 break; 2334 case FS_OPCODE_TXD: 2335 assert(!"TXD isn't supported on gen4 yet."); 2336 break; 2337 } 2338 } 2339 assert(msg_type != -1); 2340 2341 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 2342 rlen = 8; 2343 dst = vec16(dst); 2344 } 2345 2346 brw_SAMPLE(p, 2347 retype(dst, BRW_REGISTER_TYPE_UW), 2348 inst->base_mrf, 2349 src, 2350 SURF_INDEX_TEXTURE(inst->sampler), 2351 inst->sampler, 2352 WRITEMASK_XYZW, 2353 msg_type, 2354 rlen, 2355 inst->mlen, 2356 0, 2357 1, 2358 simd_mode); 2359} 2360 2361 2362/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2363 * looking like: 2364 * 2365 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2366 * 2367 * and we're trying to produce: 2368 * 2369 * DDX DDY 2370 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2371 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2372 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2373 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2374 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2375 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2376 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2377 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2378 * 2379 * and add another set of two more subspans if in 16-pixel dispatch mode. 
2380 * 2381 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2382 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2383 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2384 * between each other. We could probably do it like ddx and swizzle the right 2385 * order later, but bail for now and just produce 2386 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2387 */ 2388void 2389fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2390{ 2391 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2392 BRW_REGISTER_TYPE_F, 2393 BRW_VERTICAL_STRIDE_2, 2394 BRW_WIDTH_2, 2395 BRW_HORIZONTAL_STRIDE_0, 2396 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2397 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2398 BRW_REGISTER_TYPE_F, 2399 BRW_VERTICAL_STRIDE_2, 2400 BRW_WIDTH_2, 2401 BRW_HORIZONTAL_STRIDE_0, 2402 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2403 brw_ADD(p, dst, src0, negate(src1)); 2404} 2405 2406void 2407fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2408{ 2409 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2410 BRW_REGISTER_TYPE_F, 2411 BRW_VERTICAL_STRIDE_4, 2412 BRW_WIDTH_4, 2413 BRW_HORIZONTAL_STRIDE_0, 2414 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2415 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2416 BRW_REGISTER_TYPE_F, 2417 BRW_VERTICAL_STRIDE_4, 2418 BRW_WIDTH_4, 2419 BRW_HORIZONTAL_STRIDE_0, 2420 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2421 brw_ADD(p, dst, src0, negate(src1)); 2422} 2423 2424void 2425fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask) 2426{ 2427 if (intel->gen >= 6) { 2428 /* Gen6 no longer has the mask reg for us to just read the 2429 * active channels from. However, cmp updates just the channels 2430 * of the flag reg that are enabled, so we can get at the 2431 * channel enables that way. In this step, make a reg of ones 2432 * we'll compare to. 
2433 */ 2434 brw_MOV(p, mask, brw_imm_ud(1)); 2435 } else { 2436 brw_push_insn_state(p); 2437 brw_set_mask_control(p, BRW_MASK_DISABLE); 2438 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */ 2439 brw_pop_insn_state(p); 2440 } 2441} 2442 2443void 2444fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) 2445{ 2446 if (intel->gen >= 6) { 2447 struct brw_reg f0 = brw_flag_reg(); 2448 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2449 2450 brw_push_insn_state(p); 2451 brw_set_mask_control(p, BRW_MASK_DISABLE); 2452 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */ 2453 brw_pop_insn_state(p); 2454 2455 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2456 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */ 2457 /* Undo CMP's whacking of predication*/ 2458 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2459 2460 brw_push_insn_state(p); 2461 brw_set_mask_control(p, BRW_MASK_DISABLE); 2462 brw_AND(p, g1, f0, g1); 2463 brw_pop_insn_state(p); 2464 } else { 2465 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2466 2467 mask = brw_uw1_reg(mask.file, mask.nr, 0); 2468 2469 brw_push_insn_state(p); 2470 brw_set_mask_control(p, BRW_MASK_DISABLE); 2471 brw_AND(p, g0, mask, g0); 2472 brw_pop_insn_state(p); 2473 } 2474} 2475 2476void 2477fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2478{ 2479 assert(inst->mlen != 0); 2480 2481 brw_MOV(p, 2482 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2483 retype(src, BRW_REGISTER_TYPE_UD)); 2484 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2485 inst->offset); 2486} 2487 2488void 2489fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2490{ 2491 assert(inst->mlen != 0); 2492 2493 /* Clear any post destination dependencies that would be ignored by 2494 * the block read. See the B-Spec for pre-gen5 send instruction. 
2495 * 2496 * This could use a better solution, since texture sampling and 2497 * math reads could potentially run into it as well -- anywhere 2498 * that we have a SEND with a destination that is a register that 2499 * was written but not read within the last N instructions (what's 2500 * N? unsure). This is rare because of dead code elimination, but 2501 * not impossible. 2502 */ 2503 if (intel->gen == 4 && !intel->is_g4x) 2504 brw_MOV(p, brw_null_reg(), dst); 2505 2506 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2507 inst->offset); 2508 2509 if (intel->gen == 4 && !intel->is_g4x) { 2510 /* gen4 errata: destination from a send can't be used as a 2511 * destination until it's been read. Just read it so we don't 2512 * have to worry. 2513 */ 2514 brw_MOV(p, brw_null_reg(), dst); 2515 } 2516} 2517 2518 2519void 2520fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2521{ 2522 assert(inst->mlen != 0); 2523 2524 /* Clear any post destination dependencies that would be ignored by 2525 * the block read. See the B-Spec for pre-gen5 send instruction. 2526 * 2527 * This could use a better solution, since texture sampling and 2528 * math reads could potentially run into it as well -- anywhere 2529 * that we have a SEND with a destination that is a register that 2530 * was written but not read within the last N instructions (what's 2531 * N? unsure). This is rare because of dead code elimination, but 2532 * not impossible. 2533 */ 2534 if (intel->gen == 4 && !intel->is_g4x) 2535 brw_MOV(p, brw_null_reg(), dst); 2536 2537 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2538 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2539 2540 if (intel->gen == 4 && !intel->is_g4x) { 2541 /* gen4 errata: destination from a send can't be used as a 2542 * destination until it's been read. Just read it so we don't 2543 * have to worry. 
2544 */ 2545 brw_MOV(p, brw_null_reg(), dst); 2546 } 2547} 2548 2549/** 2550 * To be called after the last _mesa_add_state_reference() call, to 2551 * set up prog_data.param[] for assign_curb_setup() and 2552 * setup_pull_constants(). 2553 */ 2554void 2555fs_visitor::setup_paramvalues_refs() 2556{ 2557 /* Set up the pointers to ParamValues now that that array is finalized. */ 2558 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2559 c->prog_data.param[i] = 2560 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2561 this->param_offset[i]; 2562 } 2563} 2564 2565void 2566fs_visitor::assign_curb_setup() 2567{ 2568 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2569 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2570 2571 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2572 foreach_iter(exec_list_iterator, iter, this->instructions) { 2573 fs_inst *inst = (fs_inst *)iter.get(); 2574 2575 for (unsigned int i = 0; i < 3; i++) { 2576 if (inst->src[i].file == UNIFORM) { 2577 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2578 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf + 2579 constant_nr / 8, 2580 constant_nr % 8); 2581 2582 inst->src[i].file = FIXED_HW_REG; 2583 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2584 } 2585 } 2586 } 2587} 2588 2589void 2590fs_visitor::calculate_urb_setup() 2591{ 2592 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2593 urb_setup[i] = -1; 2594 } 2595 2596 int urb_next = 0; 2597 /* Figure out where each of the incoming setup attributes lands. */ 2598 if (intel->gen >= 6) { 2599 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2600 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2601 urb_setup[i] = urb_next++; 2602 } 2603 } 2604 } else { 2605 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ 2606 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2607 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2608 int fp_index; 2609 2610 if (i >= VERT_RESULT_VAR0) 2611 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2612 else if (i <= VERT_RESULT_TEX7) 2613 fp_index = i; 2614 else 2615 fp_index = -1; 2616 2617 if (fp_index >= 0) 2618 urb_setup[fp_index] = urb_next++; 2619 } 2620 } 2621 } 2622 2623 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2624 c->prog_data.urb_read_length = urb_next * 2; 2625} 2626 2627void 2628fs_visitor::assign_urb_setup() 2629{ 2630 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length; 2631 2632 /* Offset all the urb_setup[] index by the actual position of the 2633 * setup regs, now that the location of the constants has been chosen. 2634 */ 2635 foreach_iter(exec_list_iterator, iter, this->instructions) { 2636 fs_inst *inst = (fs_inst *)iter.get(); 2637 2638 if (inst->opcode == FS_OPCODE_LINTERP) { 2639 assert(inst->src[2].file == FIXED_HW_REG); 2640 inst->src[2].fixed_hw_reg.nr += urb_start; 2641 } 2642 2643 if (inst->opcode == FS_OPCODE_CINTERP) { 2644 assert(inst->src[0].file == FIXED_HW_REG); 2645 inst->src[0].fixed_hw_reg.nr += urb_start; 2646 } 2647 } 2648 2649 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2650} 2651 2652/** 2653 * Split large virtual GRFs into separate components if we can. 2654 * 2655 * This is mostly duplicated with what brw_fs_vector_splitting does, 2656 * but that's really conservative because it's afraid of doing 2657 * splitting that doesn't result in real progress after the rest of 2658 * the optimization phases, which would cause infinite looping in 2659 * optimization. We can do it once here, safely. This also has the 2660 * opportunity to split interpolated values, or maybe even uniforms, 2661 * which we don't have at the IR level. 
2662 * 2663 * We want to split, because virtual GRFs are what we register 2664 * allocate and spill (due to contiguousness requirements for some 2665 * instructions), and they're what we naturally generate in the 2666 * codegen process, but most virtual GRFs don't actually need to be 2667 * contiguous sets of GRFs. If we split, we'll end up with reduced 2668 * live intervals and better dead code elimination and coalescing. 2669 */ 2670void 2671fs_visitor::split_virtual_grfs() 2672{ 2673 int num_vars = this->virtual_grf_next; 2674 bool split_grf[num_vars]; 2675 int new_virtual_grf[num_vars]; 2676 2677 /* Try to split anything > 0 sized. */ 2678 for (int i = 0; i < num_vars; i++) { 2679 if (this->virtual_grf_sizes[i] != 1) 2680 split_grf[i] = true; 2681 else 2682 split_grf[i] = false; 2683 } 2684 2685 if (brw->has_pln) { 2686 /* PLN opcodes rely on the delta_xy being contiguous. */ 2687 split_grf[this->delta_x.reg] = false; 2688 } 2689 2690 foreach_iter(exec_list_iterator, iter, this->instructions) { 2691 fs_inst *inst = (fs_inst *)iter.get(); 2692 2693 /* Texturing produces 4 contiguous registers, so no splitting. */ 2694 if (inst->is_tex()) { 2695 split_grf[inst->dst.reg] = false; 2696 } 2697 } 2698 2699 /* Allocate new space for split regs. Note that the virtual 2700 * numbers will be contiguous. 
2701 */ 2702 for (int i = 0; i < num_vars; i++) { 2703 if (split_grf[i]) { 2704 new_virtual_grf[i] = virtual_grf_alloc(1); 2705 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 2706 int reg = virtual_grf_alloc(1); 2707 assert(reg == new_virtual_grf[i] + j - 1); 2708 (void) reg; 2709 } 2710 this->virtual_grf_sizes[i] = 1; 2711 } 2712 } 2713 2714 foreach_iter(exec_list_iterator, iter, this->instructions) { 2715 fs_inst *inst = (fs_inst *)iter.get(); 2716 2717 if (inst->dst.file == GRF && 2718 split_grf[inst->dst.reg] && 2719 inst->dst.reg_offset != 0) { 2720 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 2721 inst->dst.reg_offset - 1); 2722 inst->dst.reg_offset = 0; 2723 } 2724 for (int i = 0; i < 3; i++) { 2725 if (inst->src[i].file == GRF && 2726 split_grf[inst->src[i].reg] && 2727 inst->src[i].reg_offset != 0) { 2728 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 2729 inst->src[i].reg_offset - 1); 2730 inst->src[i].reg_offset = 0; 2731 } 2732 } 2733 } 2734 this->live_intervals_valid = false; 2735} 2736 2737/** 2738 * Choose accesses from the UNIFORM file to demote to using the pull 2739 * constant buffer. 2740 * 2741 * We allow a fragment shader to have more than the specified minimum 2742 * maximum number of fragment shader uniform components (64). If 2743 * there are too many of these, they'd fill up all of register space. 2744 * So, this will push some of them out to the pull constant buffer and 2745 * update the program to load them. 2746 */ 2747void 2748fs_visitor::setup_pull_constants() 2749{ 2750 /* Only allow 16 registers (128 uniform components) as push constants. */ 2751 unsigned int max_uniform_components = 16 * 8; 2752 if (c->prog_data.nr_params <= max_uniform_components) 2753 return; 2754 2755 /* Just demote the end of the list. We could probably do better 2756 * here, demoting things that are rarely used in the program first. 
2757 */ 2758 int pull_uniform_base = max_uniform_components; 2759 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 2760 2761 foreach_iter(exec_list_iterator, iter, this->instructions) { 2762 fs_inst *inst = (fs_inst *)iter.get(); 2763 2764 for (int i = 0; i < 3; i++) { 2765 if (inst->src[i].file != UNIFORM) 2766 continue; 2767 2768 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2769 if (uniform_nr < pull_uniform_base) 2770 continue; 2771 2772 fs_reg dst = fs_reg(this, glsl_type::float_type); 2773 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 2774 dst); 2775 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 2776 pull->ir = inst->ir; 2777 pull->annotation = inst->annotation; 2778 pull->base_mrf = 14; 2779 pull->mlen = 1; 2780 2781 inst->insert_before(pull); 2782 2783 inst->src[i].file = GRF; 2784 inst->src[i].reg = dst.reg; 2785 inst->src[i].reg_offset = 0; 2786 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 2787 } 2788 } 2789 2790 for (int i = 0; i < pull_uniform_count; i++) { 2791 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 2792 c->prog_data.pull_param_convert[i] = 2793 c->prog_data.param_convert[pull_uniform_base + i]; 2794 } 2795 c->prog_data.nr_params -= pull_uniform_count; 2796 c->prog_data.nr_pull_params = pull_uniform_count; 2797} 2798 2799void 2800fs_visitor::calculate_live_intervals() 2801{ 2802 int num_vars = this->virtual_grf_next; 2803 int *def = ralloc_array(mem_ctx, int, num_vars); 2804 int *use = ralloc_array(mem_ctx, int, num_vars); 2805 int loop_depth = 0; 2806 int loop_start = 0; 2807 int bb_header_ip = 0; 2808 2809 if (this->live_intervals_valid) 2810 return; 2811 2812 for (int i = 0; i < num_vars; i++) { 2813 def[i] = MAX_INSTRUCTION; 2814 use[i] = -1; 2815 } 2816 2817 int ip = 0; 2818 foreach_iter(exec_list_iterator, iter, this->instructions) { 2819 fs_inst *inst = (fs_inst *)iter.get(); 2820 2821 if (inst->opcode == BRW_OPCODE_DO) { 2822 
if (loop_depth++ == 0) 2823 loop_start = ip; 2824 } else if (inst->opcode == BRW_OPCODE_WHILE) { 2825 loop_depth--; 2826 2827 if (loop_depth == 0) { 2828 /* Patches up the use of vars marked for being live across 2829 * the whole loop. 2830 */ 2831 for (int i = 0; i < num_vars; i++) { 2832 if (use[i] == loop_start) { 2833 use[i] = ip; 2834 } 2835 } 2836 } 2837 } else { 2838 for (unsigned int i = 0; i < 3; i++) { 2839 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 2840 int reg = inst->src[i].reg; 2841 2842 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2843 def[reg] >= bb_header_ip)) { 2844 use[reg] = ip; 2845 } else { 2846 def[reg] = MIN2(loop_start, def[reg]); 2847 use[reg] = loop_start; 2848 2849 /* Nobody else is going to go smash our start to 2850 * later in the loop now, because def[reg] now 2851 * points before the bb header. 2852 */ 2853 } 2854 } 2855 } 2856 if (inst->dst.file == GRF && inst->dst.reg != 0) { 2857 int reg = inst->dst.reg; 2858 2859 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && 2860 !inst->predicated)) { 2861 def[reg] = MIN2(def[reg], ip); 2862 } else { 2863 def[reg] = MIN2(def[reg], loop_start); 2864 } 2865 } 2866 } 2867 2868 ip++; 2869 2870 /* Set the basic block header IP. This is used for determining 2871 * if a complete def of single-register virtual GRF in a loop 2872 * dominates a use in the same basic block. It's a quick way to 2873 * reduce the live interval range of most register used in a 2874 * loop. 
2875 */ 2876 if (inst->opcode == BRW_OPCODE_IF || 2877 inst->opcode == BRW_OPCODE_ELSE || 2878 inst->opcode == BRW_OPCODE_ENDIF || 2879 inst->opcode == BRW_OPCODE_DO || 2880 inst->opcode == BRW_OPCODE_WHILE || 2881 inst->opcode == BRW_OPCODE_BREAK || 2882 inst->opcode == BRW_OPCODE_CONTINUE) { 2883 bb_header_ip = ip; 2884 } 2885 } 2886 2887 ralloc_free(this->virtual_grf_def); 2888 ralloc_free(this->virtual_grf_use); 2889 this->virtual_grf_def = def; 2890 this->virtual_grf_use = use; 2891 2892 this->live_intervals_valid = true; 2893} 2894 2895/** 2896 * Attempts to move immediate constants into the immediate 2897 * constant slot of following instructions. 2898 * 2899 * Immediate constants are a bit tricky -- they have to be in the last 2900 * operand slot, you can't do abs/negate on them, 2901 */ 2902 2903bool 2904fs_visitor::propagate_constants() 2905{ 2906 bool progress = false; 2907 2908 calculate_live_intervals(); 2909 2910 foreach_iter(exec_list_iterator, iter, this->instructions) { 2911 fs_inst *inst = (fs_inst *)iter.get(); 2912 2913 if (inst->opcode != BRW_OPCODE_MOV || 2914 inst->predicated || 2915 inst->dst.file != GRF || inst->src[0].file != IMM || 2916 inst->dst.type != inst->src[0].type) 2917 continue; 2918 2919 /* Don't bother with cases where we should have had the 2920 * operation on the constant folded in GLSL already. 2921 */ 2922 if (inst->saturate) 2923 continue; 2924 2925 /* Found a move of a constant to a GRF. Find anything else using the GRF 2926 * before it's written, and replace it with the constant if we can. 
2927 */ 2928 exec_list_iterator scan_iter = iter; 2929 scan_iter.next(); 2930 for (; scan_iter.has_next(); scan_iter.next()) { 2931 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 2932 2933 if (scan_inst->opcode == BRW_OPCODE_DO || 2934 scan_inst->opcode == BRW_OPCODE_WHILE || 2935 scan_inst->opcode == BRW_OPCODE_ELSE || 2936 scan_inst->opcode == BRW_OPCODE_ENDIF) { 2937 break; 2938 } 2939 2940 for (int i = 2; i >= 0; i--) { 2941 if (scan_inst->src[i].file != GRF || 2942 scan_inst->src[i].reg != inst->dst.reg || 2943 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 2944 continue; 2945 2946 /* Don't bother with cases where we should have had the 2947 * operation on the constant folded in GLSL already. 2948 */ 2949 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 2950 continue; 2951 2952 switch (scan_inst->opcode) { 2953 case BRW_OPCODE_MOV: 2954 scan_inst->src[i] = inst->src[0]; 2955 progress = true; 2956 break; 2957 2958 case BRW_OPCODE_MUL: 2959 case BRW_OPCODE_ADD: 2960 if (i == 1) { 2961 scan_inst->src[i] = inst->src[0]; 2962 progress = true; 2963 } else if (i == 0 && scan_inst->src[1].file != IMM) { 2964 /* Fit this constant in by commuting the operands */ 2965 scan_inst->src[0] = scan_inst->src[1]; 2966 scan_inst->src[1] = inst->src[0]; 2967 progress = true; 2968 } 2969 break; 2970 case BRW_OPCODE_CMP: 2971 case BRW_OPCODE_SEL: 2972 if (i == 1) { 2973 scan_inst->src[i] = inst->src[0]; 2974 progress = true; 2975 } 2976 } 2977 } 2978 2979 if (scan_inst->dst.file == GRF && 2980 scan_inst->dst.reg == inst->dst.reg && 2981 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 2982 scan_inst->is_tex())) { 2983 break; 2984 } 2985 } 2986 } 2987 2988 if (progress) 2989 this->live_intervals_valid = false; 2990 2991 return progress; 2992} 2993/** 2994 * Must be called after calculate_live_intervales() to remove unused 2995 * writes to registers -- register allocation will fail otherwise 2996 * because something deffed but not used won't be considered to 2997 
* interfere with other regs. 2998 */ 2999bool 3000fs_visitor::dead_code_eliminate() 3001{ 3002 bool progress = false; 3003 int pc = 0; 3004 3005 calculate_live_intervals(); 3006 3007 foreach_iter(exec_list_iterator, iter, this->instructions) { 3008 fs_inst *inst = (fs_inst *)iter.get(); 3009 3010 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 3011 inst->remove(); 3012 progress = true; 3013 } 3014 3015 pc++; 3016 } 3017 3018 if (progress) 3019 live_intervals_valid = false; 3020 3021 return progress; 3022} 3023 3024bool 3025fs_visitor::register_coalesce() 3026{ 3027 bool progress = false; 3028 int if_depth = 0; 3029 int loop_depth = 0; 3030 3031 foreach_iter(exec_list_iterator, iter, this->instructions) { 3032 fs_inst *inst = (fs_inst *)iter.get(); 3033 3034 /* Make sure that we dominate the instructions we're going to 3035 * scan for interfering with our coalescing, or we won't have 3036 * scanned enough to see if anything interferes with our 3037 * coalescing. We don't dominate the following instructions if 3038 * we're in a loop or an if block. 3039 */ 3040 switch (inst->opcode) { 3041 case BRW_OPCODE_DO: 3042 loop_depth++; 3043 break; 3044 case BRW_OPCODE_WHILE: 3045 loop_depth--; 3046 break; 3047 case BRW_OPCODE_IF: 3048 if_depth++; 3049 break; 3050 case BRW_OPCODE_ENDIF: 3051 if_depth--; 3052 break; 3053 } 3054 if (loop_depth || if_depth) 3055 continue; 3056 3057 if (inst->opcode != BRW_OPCODE_MOV || 3058 inst->predicated || 3059 inst->saturate || 3060 inst->dst.file != GRF || inst->src[0].file != GRF || 3061 inst->dst.type != inst->src[0].type) 3062 continue; 3063 3064 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 3065 3066 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3067 * them: check for no writes to either one until the exit of the 3068 * program. 
3069 */ 3070 bool interfered = false; 3071 exec_list_iterator scan_iter = iter; 3072 scan_iter.next(); 3073 for (; scan_iter.has_next(); scan_iter.next()) { 3074 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3075 3076 if (scan_inst->dst.file == GRF) { 3077 if (scan_inst->dst.reg == inst->dst.reg && 3078 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3079 scan_inst->is_tex())) { 3080 interfered = true; 3081 break; 3082 } 3083 if (scan_inst->dst.reg == inst->src[0].reg && 3084 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3085 scan_inst->is_tex())) { 3086 interfered = true; 3087 break; 3088 } 3089 } 3090 3091 /* The gen6 MATH instruction can't handle source modifiers, so avoid 3092 * coalescing those for now. We should do something more specific. 3093 */ 3094 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) { 3095 interfered = true; 3096 break; 3097 } 3098 } 3099 if (interfered) { 3100 continue; 3101 } 3102 3103 /* Rewrite the later usage to point at the source of the move to 3104 * be removed. 
3105 */ 3106 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3107 scan_iter.next()) { 3108 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3109 3110 for (int i = 0; i < 3; i++) { 3111 if (scan_inst->src[i].file == GRF && 3112 scan_inst->src[i].reg == inst->dst.reg && 3113 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3114 scan_inst->src[i].reg = inst->src[0].reg; 3115 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3116 scan_inst->src[i].abs |= inst->src[0].abs; 3117 scan_inst->src[i].negate ^= inst->src[0].negate; 3118 scan_inst->src[i].smear = inst->src[0].smear; 3119 } 3120 } 3121 } 3122 3123 inst->remove(); 3124 progress = true; 3125 } 3126 3127 if (progress) 3128 live_intervals_valid = false; 3129 3130 return progress; 3131} 3132 3133 3134bool 3135fs_visitor::compute_to_mrf() 3136{ 3137 bool progress = false; 3138 int next_ip = 0; 3139 3140 calculate_live_intervals(); 3141 3142 foreach_iter(exec_list_iterator, iter, this->instructions) { 3143 fs_inst *inst = (fs_inst *)iter.get(); 3144 3145 int ip = next_ip; 3146 next_ip++; 3147 3148 if (inst->opcode != BRW_OPCODE_MOV || 3149 inst->predicated || 3150 inst->dst.file != MRF || inst->src[0].file != GRF || 3151 inst->dst.type != inst->src[0].type || 3152 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3153 continue; 3154 3155 /* Can't compute-to-MRF this GRF if someone else was going to 3156 * read it later. 3157 */ 3158 if (this->virtual_grf_use[inst->src[0].reg] > ip) 3159 continue; 3160 3161 /* Found a move of a GRF to a MRF. Let's see if we can go 3162 * rewrite the thing that made this GRF to write into the MRF. 3163 */ 3164 fs_inst *scan_inst; 3165 for (scan_inst = (fs_inst *)inst->prev; 3166 scan_inst->prev != NULL; 3167 scan_inst = (fs_inst *)scan_inst->prev) { 3168 if (scan_inst->dst.file == GRF && 3169 scan_inst->dst.reg == inst->src[0].reg) { 3170 /* Found the last thing to write our reg we want to turn 3171 * into a compute-to-MRF. 
3172 */ 3173 3174 if (scan_inst->is_tex()) { 3175 /* texturing writes several continuous regs, so we can't 3176 * compute-to-mrf that. 3177 */ 3178 break; 3179 } 3180 3181 /* If it's predicated, it (probably) didn't populate all 3182 * the channels. 3183 */ 3184 if (scan_inst->predicated) 3185 break; 3186 3187 /* SEND instructions can't have MRF as a destination. */ 3188 if (scan_inst->mlen) 3189 break; 3190 3191 if (intel->gen >= 6) { 3192 /* gen6 math instructions must have the destination be 3193 * GRF, so no compute-to-MRF for them. 3194 */ 3195 if (scan_inst->is_math()) { 3196 break; 3197 } 3198 } 3199 3200 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 3201 /* Found the creator of our MRF's source value. */ 3202 scan_inst->dst.file = MRF; 3203 scan_inst->dst.hw_reg = inst->dst.hw_reg; 3204 scan_inst->saturate |= inst->saturate; 3205 inst->remove(); 3206 progress = true; 3207 } 3208 break; 3209 } 3210 3211 /* We don't handle flow control here. Most computation of 3212 * values that end up in MRFs are shortly before the MRF 3213 * write anyway. 3214 */ 3215 if (scan_inst->opcode == BRW_OPCODE_DO || 3216 scan_inst->opcode == BRW_OPCODE_WHILE || 3217 scan_inst->opcode == BRW_OPCODE_ELSE || 3218 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3219 break; 3220 } 3221 3222 /* You can't read from an MRF, so if someone else reads our 3223 * MRF's source GRF that we wanted to rewrite, that stops us. 3224 */ 3225 bool interfered = false; 3226 for (int i = 0; i < 3; i++) { 3227 if (scan_inst->src[i].file == GRF && 3228 scan_inst->src[i].reg == inst->src[0].reg && 3229 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 3230 interfered = true; 3231 } 3232 } 3233 if (interfered) 3234 break; 3235 3236 if (scan_inst->dst.file == MRF && 3237 scan_inst->dst.hw_reg == inst->dst.hw_reg) { 3238 /* Somebody else wrote our MRF here, so we can't can't 3239 * compute-to-MRF before that. 
3240 */ 3241 break; 3242 } 3243 3244 if (scan_inst->mlen > 0) { 3245 /* Found a SEND instruction, which means that there are 3246 * live values in MRFs from base_mrf to base_mrf + 3247 * scan_inst->mlen - 1. Don't go pushing our MRF write up 3248 * above it. 3249 */ 3250 if (inst->dst.hw_reg >= scan_inst->base_mrf && 3251 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) { 3252 break; 3253 } 3254 } 3255 } 3256 } 3257 3258 return progress; 3259} 3260 3261/** 3262 * Walks through basic blocks, locking for repeated MRF writes and 3263 * removing the later ones. 3264 */ 3265bool 3266fs_visitor::remove_duplicate_mrf_writes() 3267{ 3268 fs_inst *last_mrf_move[16]; 3269 bool progress = false; 3270 3271 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3272 3273 foreach_iter(exec_list_iterator, iter, this->instructions) { 3274 fs_inst *inst = (fs_inst *)iter.get(); 3275 3276 switch (inst->opcode) { 3277 case BRW_OPCODE_DO: 3278 case BRW_OPCODE_WHILE: 3279 case BRW_OPCODE_IF: 3280 case BRW_OPCODE_ELSE: 3281 case BRW_OPCODE_ENDIF: 3282 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3283 continue; 3284 default: 3285 break; 3286 } 3287 3288 if (inst->opcode == BRW_OPCODE_MOV && 3289 inst->dst.file == MRF) { 3290 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 3291 if (prev_inst && inst->equals(prev_inst)) { 3292 inst->remove(); 3293 progress = true; 3294 continue; 3295 } 3296 } 3297 3298 /* Clear out the last-write records for MRFs that were overwritten. */ 3299 if (inst->dst.file == MRF) { 3300 last_mrf_move[inst->dst.hw_reg] = NULL; 3301 } 3302 3303 if (inst->mlen > 0) { 3304 /* Found a SEND instruction, which will include two or fewer 3305 * implied MRF writes. We could do better here. 3306 */ 3307 for (int i = 0; i < implied_mrf_writes(inst); i++) { 3308 last_mrf_move[inst->base_mrf + i] = NULL; 3309 } 3310 } 3311 3312 /* Clear out any MRF move records whose sources got overwritten. 
*/ 3313 if (inst->dst.file == GRF) { 3314 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3315 if (last_mrf_move[i] && 3316 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3317 last_mrf_move[i] = NULL; 3318 } 3319 } 3320 } 3321 3322 if (inst->opcode == BRW_OPCODE_MOV && 3323 inst->dst.file == MRF && 3324 inst->src[0].file == GRF && 3325 !inst->predicated) { 3326 last_mrf_move[inst->dst.hw_reg] = inst; 3327 } 3328 } 3329 3330 return progress; 3331} 3332 3333bool 3334fs_visitor::virtual_grf_interferes(int a, int b) 3335{ 3336 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3337 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3338 3339 /* We can't handle dead register writes here, without iterating 3340 * over the whole instruction stream to find every single dead 3341 * write to that register to compare to the live interval of the 3342 * other register. Just assert that dead_code_eliminate() has been 3343 * called. 3344 */ 3345 assert((this->virtual_grf_use[a] != -1 || 3346 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3347 (this->virtual_grf_use[b] != -1 || 3348 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3349 3350 return start < end; 3351} 3352 3353static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3354{ 3355 struct brw_reg brw_reg; 3356 3357 switch (reg->file) { 3358 case GRF: 3359 case ARF: 3360 case MRF: 3361 if (reg->smear == -1) { 3362 brw_reg = brw_vec8_reg(reg->file, 3363 reg->hw_reg, 0); 3364 } else { 3365 brw_reg = brw_vec1_reg(reg->file, 3366 reg->hw_reg, reg->smear); 3367 } 3368 brw_reg = retype(brw_reg, reg->type); 3369 break; 3370 case IMM: 3371 switch (reg->type) { 3372 case BRW_REGISTER_TYPE_F: 3373 brw_reg = brw_imm_f(reg->imm.f); 3374 break; 3375 case BRW_REGISTER_TYPE_D: 3376 brw_reg = brw_imm_d(reg->imm.i); 3377 break; 3378 case BRW_REGISTER_TYPE_UD: 3379 brw_reg = brw_imm_ud(reg->imm.u); 3380 break; 3381 default: 3382 assert(!"not reached"); 3383 brw_reg = brw_null_reg(); 3384 
break; 3385 } 3386 break; 3387 case FIXED_HW_REG: 3388 brw_reg = reg->fixed_hw_reg; 3389 break; 3390 case BAD_FILE: 3391 /* Probably unused. */ 3392 brw_reg = brw_null_reg(); 3393 break; 3394 case UNIFORM: 3395 assert(!"not reached"); 3396 brw_reg = brw_null_reg(); 3397 break; 3398 default: 3399 assert(!"not reached"); 3400 brw_reg = brw_null_reg(); 3401 break; 3402 } 3403 if (reg->abs) 3404 brw_reg = brw_abs(brw_reg); 3405 if (reg->negate) 3406 brw_reg = negate(brw_reg); 3407 3408 return brw_reg; 3409} 3410 3411void 3412fs_visitor::generate_code() 3413{ 3414 int last_native_inst = 0; 3415 const char *last_annotation_string = NULL; 3416 ir_instruction *last_annotation_ir = NULL; 3417 3418 int if_stack_array_size = 16; 3419 int loop_stack_array_size = 16; 3420 int if_stack_depth = 0, loop_stack_depth = 0; 3421 brw_instruction **if_stack = 3422 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size); 3423 brw_instruction **loop_stack = 3424 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); 3425 int *if_depth_in_loop = 3426 rzalloc_array(this->mem_ctx, int, loop_stack_array_size); 3427 3428 3429 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3430 printf("Native code for fragment shader %d:\n", 3431 ctx->Shader.CurrentFragmentProgram->Name); 3432 } 3433 3434 foreach_iter(exec_list_iterator, iter, this->instructions) { 3435 fs_inst *inst = (fs_inst *)iter.get(); 3436 struct brw_reg src[3], dst; 3437 3438 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3439 if (last_annotation_ir != inst->ir) { 3440 last_annotation_ir = inst->ir; 3441 if (last_annotation_ir) { 3442 printf(" "); 3443 last_annotation_ir->print(); 3444 printf("\n"); 3445 } 3446 } 3447 if (last_annotation_string != inst->annotation) { 3448 last_annotation_string = inst->annotation; 3449 if (last_annotation_string) 3450 printf(" %s\n", last_annotation_string); 3451 } 3452 } 3453 3454 for (unsigned int i = 0; i < 3; i++) { 3455 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3456 } 3457 dst = 
brw_reg_from_fs_reg(&inst->dst); 3458 3459 brw_set_conditionalmod(p, inst->conditional_mod); 3460 brw_set_predicate_control(p, inst->predicated); 3461 brw_set_saturate(p, inst->saturate); 3462 3463 switch (inst->opcode) { 3464 case BRW_OPCODE_MOV: 3465 brw_MOV(p, dst, src[0]); 3466 break; 3467 case BRW_OPCODE_ADD: 3468 brw_ADD(p, dst, src[0], src[1]); 3469 break; 3470 case BRW_OPCODE_MUL: 3471 brw_MUL(p, dst, src[0], src[1]); 3472 break; 3473 3474 case BRW_OPCODE_FRC: 3475 brw_FRC(p, dst, src[0]); 3476 break; 3477 case BRW_OPCODE_RNDD: 3478 brw_RNDD(p, dst, src[0]); 3479 break; 3480 case BRW_OPCODE_RNDE: 3481 brw_RNDE(p, dst, src[0]); 3482 break; 3483 case BRW_OPCODE_RNDZ: 3484 brw_RNDZ(p, dst, src[0]); 3485 break; 3486 3487 case BRW_OPCODE_AND: 3488 brw_AND(p, dst, src[0], src[1]); 3489 break; 3490 case BRW_OPCODE_OR: 3491 brw_OR(p, dst, src[0], src[1]); 3492 break; 3493 case BRW_OPCODE_XOR: 3494 brw_XOR(p, dst, src[0], src[1]); 3495 break; 3496 case BRW_OPCODE_NOT: 3497 brw_NOT(p, dst, src[0]); 3498 break; 3499 case BRW_OPCODE_ASR: 3500 brw_ASR(p, dst, src[0], src[1]); 3501 break; 3502 case BRW_OPCODE_SHR: 3503 brw_SHR(p, dst, src[0], src[1]); 3504 break; 3505 case BRW_OPCODE_SHL: 3506 brw_SHL(p, dst, src[0], src[1]); 3507 break; 3508 3509 case BRW_OPCODE_CMP: 3510 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3511 break; 3512 case BRW_OPCODE_SEL: 3513 brw_SEL(p, dst, src[0], src[1]); 3514 break; 3515 3516 case BRW_OPCODE_IF: 3517 if (inst->src[0].file != BAD_FILE) { 3518 assert(intel->gen >= 6); 3519 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]); 3520 } else { 3521 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8); 3522 } 3523 if_depth_in_loop[loop_stack_depth]++; 3524 if_stack_depth++; 3525 if (if_stack_array_size <= if_stack_depth) { 3526 if_stack_array_size *= 2; 3527 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *, 3528 if_stack_array_size); 3529 } 3530 break; 3531 3532 case BRW_OPCODE_ELSE: 3533 
if_stack[if_stack_depth - 1] = 3534 brw_ELSE(p, if_stack[if_stack_depth - 1]); 3535 break; 3536 case BRW_OPCODE_ENDIF: 3537 if_stack_depth--; 3538 brw_ENDIF(p , if_stack[if_stack_depth]); 3539 if_depth_in_loop[loop_stack_depth]--; 3540 break; 3541 3542 case BRW_OPCODE_DO: 3543 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3544 if (loop_stack_array_size <= loop_stack_depth) { 3545 loop_stack_array_size *= 2; 3546 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, 3547 loop_stack_array_size); 3548 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, 3549 loop_stack_array_size); 3550 } 3551 if_depth_in_loop[loop_stack_depth] = 0; 3552 break; 3553 3554 case BRW_OPCODE_BREAK: 3555 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3556 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3557 break; 3558 case BRW_OPCODE_CONTINUE: 3559 /* FINISHME: We need to write the loop instruction support still. */ 3560 if (intel->gen >= 6) 3561 gen6_CONT(p, loop_stack[loop_stack_depth - 1]); 3562 else 3563 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3564 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3565 break; 3566 3567 case BRW_OPCODE_WHILE: { 3568 struct brw_instruction *inst0, *inst1; 3569 GLuint br = 1; 3570 3571 if (intel->gen >= 5) 3572 br = 2; 3573 3574 assert(loop_stack_depth > 0); 3575 loop_stack_depth--; 3576 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3577 if (intel->gen < 6) { 3578 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3579 while (inst0 > loop_stack[loop_stack_depth]) { 3580 inst0--; 3581 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3582 inst0->bits3.if_else.jump_count == 0) { 3583 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3584 } 3585 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3586 inst0->bits3.if_else.jump_count == 0) { 3587 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3588 } 3589 } 3590 } 3591 } 3592 break; 3593 3594 case FS_OPCODE_RCP: 
3595 case FS_OPCODE_RSQ: 3596 case FS_OPCODE_SQRT: 3597 case FS_OPCODE_EXP2: 3598 case FS_OPCODE_LOG2: 3599 case FS_OPCODE_POW: 3600 case FS_OPCODE_SIN: 3601 case FS_OPCODE_COS: 3602 generate_math(inst, dst, src); 3603 break; 3604 case FS_OPCODE_CINTERP: 3605 brw_MOV(p, dst, src[0]); 3606 break; 3607 case FS_OPCODE_LINTERP: 3608 generate_linterp(inst, dst, src); 3609 break; 3610 case FS_OPCODE_TEX: 3611 case FS_OPCODE_TXB: 3612 case FS_OPCODE_TXD: 3613 case FS_OPCODE_TXL: 3614 generate_tex(inst, dst, src[0]); 3615 break; 3616 case FS_OPCODE_DISCARD_NOT: 3617 generate_discard_not(inst, dst); 3618 break; 3619 case FS_OPCODE_DISCARD_AND: 3620 generate_discard_and(inst, src[0]); 3621 break; 3622 case FS_OPCODE_DDX: 3623 generate_ddx(inst, dst, src[0]); 3624 break; 3625 case FS_OPCODE_DDY: 3626 generate_ddy(inst, dst, src[0]); 3627 break; 3628 3629 case FS_OPCODE_SPILL: 3630 generate_spill(inst, src[0]); 3631 break; 3632 3633 case FS_OPCODE_UNSPILL: 3634 generate_unspill(inst, dst); 3635 break; 3636 3637 case FS_OPCODE_PULL_CONSTANT_LOAD: 3638 generate_pull_constant_load(inst, dst); 3639 break; 3640 3641 case FS_OPCODE_FB_WRITE: 3642 generate_fb_write(inst); 3643 break; 3644 default: 3645 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { 3646 _mesa_problem(ctx, "Unsupported opcode `%s' in FS", 3647 brw_opcodes[inst->opcode].name); 3648 } else { 3649 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); 3650 } 3651 fail("unsupported opcode in FS\n"); 3652 } 3653 3654 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3655 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) { 3656 if (0) { 3657 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3658 ((uint32_t *)&p->store[i])[3], 3659 ((uint32_t *)&p->store[i])[2], 3660 ((uint32_t *)&p->store[i])[1], 3661 ((uint32_t *)&p->store[i])[0]); 3662 } 3663 brw_disasm(stdout, &p->store[i], intel->gen); 3664 } 3665 } 3666 3667 last_native_inst = p->nr_insn; 3668 } 3669 3670 ralloc_free(if_stack); 3671 ralloc_free(loop_stack); 3672 
ralloc_free(if_depth_in_loop); 3673 3674 brw_set_uip_jip(p); 3675 3676 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS 3677 * emit issues, it doesn't get the jump distances into the output, 3678 * which is often something we want to debug. So this is here in 3679 * case you're doing that. 3680 */ 3681 if (0) { 3682 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3683 for (unsigned int i = 0; i < p->nr_insn; i++) { 3684 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3685 ((uint32_t *)&p->store[i])[3], 3686 ((uint32_t *)&p->store[i])[2], 3687 ((uint32_t *)&p->store[i])[1], 3688 ((uint32_t *)&p->store[i])[0]); 3689 brw_disasm(stdout, &p->store[i], intel->gen); 3690 } 3691 } 3692 } 3693} 3694 3695GLboolean 3696brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) 3697{ 3698 struct intel_context *intel = &brw->intel; 3699 struct gl_context *ctx = &intel->ctx; 3700 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; 3701 3702 if (!prog) 3703 return GL_FALSE; 3704 3705 struct brw_shader *shader = 3706 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 3707 if (!shader) 3708 return GL_FALSE; 3709 3710 /* We always use 8-wide mode, at least for now. For one, flow 3711 * control only works in 8-wide. Also, when we're fragment shader 3712 * bound, we're almost always under register pressure as well, so 3713 * 8-wide would save us from the performance cliff of spilling 3714 * regs. 3715 */ 3716 c->dispatch_width = 8; 3717 3718 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3719 printf("GLSL IR for native fragment shader %d:\n", prog->Name); 3720 _mesa_print_ir(shader->ir, NULL); 3721 printf("\n"); 3722 } 3723 3724 /* Now the main event: Visit the shader IR and generate our FS IR for it. 
3725 */ 3726 fs_visitor v(c, shader); 3727 3728 if (0) { 3729 v.emit_dummy_fs(); 3730 } else { 3731 v.calculate_urb_setup(); 3732 if (intel->gen < 6) 3733 v.emit_interpolation_setup_gen4(); 3734 else 3735 v.emit_interpolation_setup_gen6(); 3736 3737 /* Generate FS IR for main(). (the visitor only descends into 3738 * functions called "main"). 3739 */ 3740 foreach_iter(exec_list_iterator, iter, *shader->ir) { 3741 ir_instruction *ir = (ir_instruction *)iter.get(); 3742 v.base_ir = ir; 3743 ir->accept(&v); 3744 } 3745 3746 v.emit_fb_writes(); 3747 3748 v.split_virtual_grfs(); 3749 3750 v.setup_paramvalues_refs(); 3751 v.setup_pull_constants(); 3752 3753 bool progress; 3754 do { 3755 progress = false; 3756 3757 progress = v.remove_duplicate_mrf_writes() || progress; 3758 3759 progress = v.propagate_constants() || progress; 3760 progress = v.register_coalesce() || progress; 3761 progress = v.compute_to_mrf() || progress; 3762 progress = v.dead_code_eliminate() || progress; 3763 } while (progress); 3764 3765 v.schedule_instructions(); 3766 3767 v.assign_curb_setup(); 3768 v.assign_urb_setup(); 3769 3770 if (0) { 3771 /* Debug of register spilling: Go spill everything. */ 3772 int virtual_grf_count = v.virtual_grf_next; 3773 for (int i = 1; i < virtual_grf_count; i++) { 3774 v.spill_reg(i); 3775 } 3776 } 3777 3778 if (0) 3779 v.assign_regs_trivial(); 3780 else { 3781 while (!v.assign_regs()) { 3782 if (v.failed) 3783 break; 3784 } 3785 } 3786 } 3787 3788 if (!v.failed) 3789 v.generate_code(); 3790 3791 assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */ 3792 3793 if (v.failed) 3794 return GL_FALSE; 3795 3796 c->prog_data.total_grf = v.grf_used; 3797 3798 return GL_TRUE; 3799} 3800