brw_fs.cpp revision b943b9b1a696cf51adfb2a18bcb9cf503fb2737f
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28extern "C" { 29 30#include <sys/types.h> 31 32#include "main/macros.h" 33#include "main/shaderobj.h" 34#include "main/uniforms.h" 35#include "program/prog_parameter.h" 36#include "program/prog_print.h" 37#include "program/prog_optimize.h" 38#include "program/register_allocate.h" 39#include "program/sampler.h" 40#include "program/hash_table.h" 41#include "brw_context.h" 42#include "brw_eu.h" 43#include "brw_wm.h" 44} 45#include "brw_fs.h" 46#include "../glsl/glsl_types.h" 47#include "../glsl/ir_optimization.h" 48#include "../glsl/ir_print_visitor.h" 49 50#define MAX_INSTRUCTION (1 << 30) 51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); 52 53struct gl_shader * 54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) 55{ 56 struct brw_shader *shader; 57 58 shader = rzalloc(NULL, struct brw_shader); 59 if (shader) { 60 shader->base.Type = type; 61 shader->base.Name = name; 62 _mesa_init_shader(ctx, &shader->base); 63 } 64 65 return &shader->base; 66} 67 68struct gl_shader_program * 69brw_new_shader_program(struct gl_context *ctx, GLuint name) 70{ 71 struct brw_shader_program *prog; 72 prog = rzalloc(NULL, struct brw_shader_program); 73 if (prog) { 74 prog->base.Name = name; 75 _mesa_init_shader_program(ctx, &prog->base); 76 } 77 return &prog->base; 78} 79 80GLboolean 81brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) 82{ 83 struct brw_context *brw = brw_context(ctx); 84 struct intel_context *intel = &brw->intel; 85 86 struct brw_shader *shader = 87 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 88 if (shader != NULL) { 89 void *mem_ctx = ralloc_context(NULL); 90 bool progress; 91 92 if (shader->ir) 93 ralloc_free(shader->ir); 94 shader->ir = new(shader) exec_list; 95 clone_ir_list(mem_ctx, shader->ir, shader->base.ir); 96 97 do_mat_op_to_vec(shader->ir); 98 lower_instructions(shader->ir, 99 MOD_TO_FRACT | 100 DIV_TO_MUL_RCP | 101 
SUB_TO_ADD_NEG | 102 EXP_TO_EXP2 | 103 LOG_TO_LOG2); 104 105 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, 106 * if-statements need to be flattened. 107 */ 108 if (intel->gen < 6) 109 lower_if_to_cond_assign(shader->ir, 16); 110 111 do_lower_texture_projection(shader->ir); 112 do_vec_index_to_cond_assign(shader->ir); 113 brw_do_cubemap_normalize(shader->ir); 114 lower_noise(shader->ir); 115 lower_quadop_vector(shader->ir, false); 116 lower_variable_index_to_cond_assign(shader->ir, 117 GL_TRUE, /* input */ 118 GL_TRUE, /* output */ 119 GL_TRUE, /* temp */ 120 GL_TRUE /* uniform */ 121 ); 122 123 do { 124 progress = false; 125 126 brw_do_channel_expressions(shader->ir); 127 brw_do_vector_splitting(shader->ir); 128 129 progress = do_lower_jumps(shader->ir, true, true, 130 true, /* main return */ 131 false, /* continue */ 132 false /* loops */ 133 ) || progress; 134 135 progress = do_common_optimization(shader->ir, true, 32) || progress; 136 } while (progress); 137 138 validate_ir_tree(shader->ir); 139 140 reparent_ir(shader->ir, shader->ir); 141 ralloc_free(mem_ctx); 142 } 143 144 if (!_mesa_ir_link_shader(ctx, prog)) 145 return GL_FALSE; 146 147 return GL_TRUE; 148} 149 150static int 151type_size(const struct glsl_type *type) 152{ 153 unsigned int size, i; 154 155 switch (type->base_type) { 156 case GLSL_TYPE_UINT: 157 case GLSL_TYPE_INT: 158 case GLSL_TYPE_FLOAT: 159 case GLSL_TYPE_BOOL: 160 return type->components(); 161 case GLSL_TYPE_ARRAY: 162 return type_size(type->fields.array) * type->length; 163 case GLSL_TYPE_STRUCT: 164 size = 0; 165 for (i = 0; i < type->length; i++) { 166 size += type_size(type->fields.structure[i].type); 167 } 168 return size; 169 case GLSL_TYPE_SAMPLER: 170 /* Samplers take up no register space, since they're baked in at 171 * link time. 172 */ 173 return 0; 174 default: 175 assert(!"not reached"); 176 return 0; 177 } 178} 179 180void 181fs_visitor::fail(const char *format, ...) 
182{ 183 if (!failed) { 184 failed = true; 185 186 if (INTEL_DEBUG & DEBUG_WM) { 187 fprintf(stderr, "FS compile failed: "); 188 189 va_list va; 190 va_start(va, format); 191 vfprintf(stderr, format, va); 192 va_end(va); 193 } 194 } 195} 196 197void 198fs_visitor::push_force_uncompressed() 199{ 200 force_uncompressed_stack++; 201} 202 203void 204fs_visitor::pop_force_uncompressed() 205{ 206 force_uncompressed_stack--; 207 assert(force_uncompressed_stack >= 0); 208} 209 210void 211fs_visitor::push_force_sechalf() 212{ 213 force_sechalf_stack++; 214} 215 216void 217fs_visitor::pop_force_sechalf() 218{ 219 force_sechalf_stack--; 220 assert(force_sechalf_stack >= 0); 221} 222 223/** 224 * Returns how many MRFs an FS opcode will write over. 225 * 226 * Note that this is not the 0 or 1 implied writes in an actual gen 227 * instruction -- the FS opcodes often generate MOVs in addition. 228 */ 229int 230fs_visitor::implied_mrf_writes(fs_inst *inst) 231{ 232 if (inst->mlen == 0) 233 return 0; 234 235 switch (inst->opcode) { 236 case FS_OPCODE_RCP: 237 case FS_OPCODE_RSQ: 238 case FS_OPCODE_SQRT: 239 case FS_OPCODE_EXP2: 240 case FS_OPCODE_LOG2: 241 case FS_OPCODE_SIN: 242 case FS_OPCODE_COS: 243 return 1 * c->dispatch_width / 8; 244 case FS_OPCODE_POW: 245 return 2 * c->dispatch_width / 8; 246 case FS_OPCODE_TEX: 247 case FS_OPCODE_TXB: 248 case FS_OPCODE_TXD: 249 case FS_OPCODE_TXL: 250 return 1; 251 case FS_OPCODE_FB_WRITE: 252 return 2; 253 case FS_OPCODE_PULL_CONSTANT_LOAD: 254 case FS_OPCODE_UNSPILL: 255 return 1; 256 case FS_OPCODE_SPILL: 257 return 2; 258 default: 259 assert(!"not reached"); 260 return inst->mlen; 261 } 262} 263 264int 265fs_visitor::virtual_grf_alloc(int size) 266{ 267 if (virtual_grf_array_size <= virtual_grf_next) { 268 if (virtual_grf_array_size == 0) 269 virtual_grf_array_size = 16; 270 else 271 virtual_grf_array_size *= 2; 272 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 273 virtual_grf_array_size); 274 275 /* This slot is 
always unused. */ 276 virtual_grf_sizes[0] = 0; 277 } 278 virtual_grf_sizes[virtual_grf_next] = size; 279 return virtual_grf_next++; 280} 281 282/** Fixed HW reg constructor. */ 283fs_reg::fs_reg(enum register_file file, int hw_reg) 284{ 285 init(); 286 this->file = file; 287 this->hw_reg = hw_reg; 288 this->type = BRW_REGISTER_TYPE_F; 289} 290 291/** Fixed HW reg constructor. */ 292fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 293{ 294 init(); 295 this->file = file; 296 this->hw_reg = hw_reg; 297 this->type = type; 298} 299 300int 301brw_type_for_base_type(const struct glsl_type *type) 302{ 303 switch (type->base_type) { 304 case GLSL_TYPE_FLOAT: 305 return BRW_REGISTER_TYPE_F; 306 case GLSL_TYPE_INT: 307 case GLSL_TYPE_BOOL: 308 return BRW_REGISTER_TYPE_D; 309 case GLSL_TYPE_UINT: 310 return BRW_REGISTER_TYPE_UD; 311 case GLSL_TYPE_ARRAY: 312 case GLSL_TYPE_STRUCT: 313 case GLSL_TYPE_SAMPLER: 314 /* These should be overridden with the type of the member when 315 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely 316 * way to trip up if we don't. 317 */ 318 return BRW_REGISTER_TYPE_UD; 319 default: 320 assert(!"not reached"); 321 return BRW_REGISTER_TYPE_F; 322 } 323} 324 325/** Automatic reg constructor. */ 326fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 327{ 328 init(); 329 330 this->file = GRF; 331 this->reg = v->virtual_grf_alloc(type_size(type)); 332 this->reg_offset = 0; 333 this->type = brw_type_for_base_type(type); 334} 335 336fs_reg * 337fs_visitor::variable_storage(ir_variable *var) 338{ 339 return (fs_reg *)hash_table_find(this->variable_ht, var); 340} 341 342/* Our support for uniforms is piggy-backed on the struct 343 * gl_fragment_program, because that's where the values actually 344 * get stored, rather than in some global gl_shader_program uniform 345 * store. 
346 */ 347int 348fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 349{ 350 unsigned int offset = 0; 351 352 if (type->is_matrix()) { 353 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 354 type->vector_elements, 355 1); 356 357 for (unsigned int i = 0; i < type->matrix_columns; i++) { 358 offset += setup_uniform_values(loc + offset, column); 359 } 360 361 return offset; 362 } 363 364 switch (type->base_type) { 365 case GLSL_TYPE_FLOAT: 366 case GLSL_TYPE_UINT: 367 case GLSL_TYPE_INT: 368 case GLSL_TYPE_BOOL: 369 for (unsigned int i = 0; i < type->vector_elements; i++) { 370 unsigned int param = c->prog_data.nr_params++; 371 372 assert(param < ARRAY_SIZE(c->prog_data.param)); 373 374 switch (type->base_type) { 375 case GLSL_TYPE_FLOAT: 376 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 377 break; 378 case GLSL_TYPE_UINT: 379 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 380 break; 381 case GLSL_TYPE_INT: 382 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 383 break; 384 case GLSL_TYPE_BOOL: 385 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 386 break; 387 default: 388 assert(!"not reached"); 389 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 390 break; 391 } 392 this->param_index[param] = loc; 393 this->param_offset[param] = i; 394 } 395 return 1; 396 397 case GLSL_TYPE_STRUCT: 398 for (unsigned int i = 0; i < type->length; i++) { 399 offset += setup_uniform_values(loc + offset, 400 type->fields.structure[i].type); 401 } 402 return offset; 403 404 case GLSL_TYPE_ARRAY: 405 for (unsigned int i = 0; i < type->length; i++) { 406 offset += setup_uniform_values(loc + offset, type->fields.array); 407 } 408 return offset; 409 410 case GLSL_TYPE_SAMPLER: 411 /* The sampler takes up a slot, but we don't use any values from it. */ 412 return 1; 413 414 default: 415 assert(!"not reached"); 416 return 0; 417 } 418} 419 420 421/* Our support for builtin uniforms is even scarier than non-builtin. 
422 * It sits on top of the PROG_STATE_VAR parameters that are 423 * automatically updated from GL context state. 424 */ 425void 426fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 427{ 428 const ir_state_slot *const slots = ir->state_slots; 429 assert(ir->state_slots != NULL); 430 431 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 432 /* This state reference has already been setup by ir_to_mesa, but we'll 433 * get the same index back here. 434 */ 435 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 436 (gl_state_index *)slots[i].tokens); 437 438 /* Add each of the unique swizzles of the element as a parameter. 439 * This'll end up matching the expected layout of the 440 * array/matrix/structure we're trying to fill in. 441 */ 442 int last_swiz = -1; 443 for (unsigned int j = 0; j < 4; j++) { 444 int swiz = GET_SWZ(slots[i].swizzle, j); 445 if (swiz == last_swiz) 446 break; 447 last_swiz = swiz; 448 449 c->prog_data.param_convert[c->prog_data.nr_params] = 450 PARAM_NO_CONVERT; 451 this->param_index[c->prog_data.nr_params] = index; 452 this->param_offset[c->prog_data.nr_params] = swiz; 453 c->prog_data.nr_params++; 454 } 455 } 456} 457 458fs_reg * 459fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 460{ 461 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 462 fs_reg wpos = *reg; 463 fs_reg neg_y = this->pixel_y; 464 neg_y.negate = true; 465 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 466 467 /* gl_FragCoord.x */ 468 if (ir->pixel_center_integer) { 469 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 470 } else { 471 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 472 } 473 wpos.reg_offset++; 474 475 /* gl_FragCoord.y */ 476 if (!flip && ir->pixel_center_integer) { 477 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 478 } else { 479 fs_reg pixel_y = this->pixel_y; 480 float offset = (ir->pixel_center_integer ? 
0.0 : 0.5); 481 482 if (flip) { 483 pixel_y.negate = true; 484 offset += c->key.drawable_height - 1.0; 485 } 486 487 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 488 } 489 wpos.reg_offset++; 490 491 /* gl_FragCoord.z */ 492 if (intel->gen >= 6) { 493 emit(BRW_OPCODE_MOV, wpos, 494 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 495 } else { 496 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 497 interp_reg(FRAG_ATTRIB_WPOS, 2)); 498 } 499 wpos.reg_offset++; 500 501 /* gl_FragCoord.w: Already set up in emit_interpolation */ 502 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 503 504 return reg; 505} 506 507fs_reg * 508fs_visitor::emit_general_interpolation(ir_variable *ir) 509{ 510 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 511 /* Interpolation is always in floating point regs. */ 512 reg->type = BRW_REGISTER_TYPE_F; 513 fs_reg attr = *reg; 514 515 unsigned int array_elements; 516 const glsl_type *type; 517 518 if (ir->type->is_array()) { 519 array_elements = ir->type->length; 520 if (array_elements == 0) { 521 fail("dereferenced array '%s' has length 0\n", ir->name); 522 } 523 type = ir->type->fields.array; 524 } else { 525 array_elements = 1; 526 type = ir->type; 527 } 528 529 int location = ir->location; 530 for (unsigned int i = 0; i < array_elements; i++) { 531 for (unsigned int j = 0; j < type->matrix_columns; j++) { 532 if (urb_setup[location] == -1) { 533 /* If there's no incoming setup data for this slot, don't 534 * emit interpolation for it. 535 */ 536 attr.reg_offset += type->vector_elements; 537 location++; 538 continue; 539 } 540 541 bool is_gl_Color = 542 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 543 544 if (c->key.flat_shade && is_gl_Color) { 545 /* Constant interpolation (flat shading) case. The SF has 546 * handed us defined values in only the constant offset 547 * field of the setup reg. 
548 */ 549 for (unsigned int k = 0; k < type->vector_elements; k++) { 550 struct brw_reg interp = interp_reg(location, k); 551 interp = suboffset(interp, 3); 552 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 553 attr.reg_offset++; 554 } 555 } else { 556 /* Perspective interpolation case. */ 557 for (unsigned int k = 0; k < type->vector_elements; k++) { 558 struct brw_reg interp = interp_reg(location, k); 559 emit(FS_OPCODE_LINTERP, attr, 560 this->delta_x, this->delta_y, fs_reg(interp)); 561 attr.reg_offset++; 562 } 563 564 if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) { 565 attr.reg_offset -= type->vector_elements; 566 for (unsigned int k = 0; k < type->vector_elements; k++) { 567 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 568 attr.reg_offset++; 569 } 570 } 571 } 572 location++; 573 } 574 } 575 576 return reg; 577} 578 579fs_reg * 580fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 581{ 582 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 583 584 /* The frontfacing comes in as a bit in the thread payload. 
*/ 585 if (intel->gen >= 6) { 586 emit(BRW_OPCODE_ASR, *reg, 587 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 588 fs_reg(15)); 589 emit(BRW_OPCODE_NOT, *reg, *reg); 590 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 591 } else { 592 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 593 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 594 * us front face 595 */ 596 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 597 fs_reg(r1_6ud), 598 fs_reg(1u << 31)); 599 inst->conditional_mod = BRW_CONDITIONAL_L; 600 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 601 } 602 603 return reg; 604} 605 606fs_inst * 607fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 608{ 609 switch (opcode) { 610 case FS_OPCODE_RCP: 611 case FS_OPCODE_RSQ: 612 case FS_OPCODE_SQRT: 613 case FS_OPCODE_EXP2: 614 case FS_OPCODE_LOG2: 615 case FS_OPCODE_SIN: 616 case FS_OPCODE_COS: 617 break; 618 default: 619 assert(!"not reached: bad math opcode"); 620 return NULL; 621 } 622 623 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 624 * might be able to do better by doing execsize = 1 math and then 625 * expanding that result out, but we would need to be careful with 626 * masking. 627 * 628 * The hardware ignores source modifiers (negate and abs) on math 629 * instructions, so we also move to a temp to set those up. 
630 */ 631 if (intel->gen >= 6 && (src.file == UNIFORM || 632 src.abs || 633 src.negate)) { 634 fs_reg expanded = fs_reg(this, glsl_type::float_type); 635 emit(BRW_OPCODE_MOV, expanded, src); 636 src = expanded; 637 } 638 639 fs_inst *inst = emit(opcode, dst, src); 640 641 if (intel->gen < 6) { 642 inst->base_mrf = 2; 643 inst->mlen = c->dispatch_width / 8; 644 } 645 646 return inst; 647} 648 649fs_inst * 650fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 651{ 652 int base_mrf = 2; 653 fs_inst *inst; 654 655 assert(opcode == FS_OPCODE_POW); 656 657 if (intel->gen >= 6) { 658 /* Can't do hstride == 0 args to gen6 math, so expand it out. 659 * 660 * The hardware ignores source modifiers (negate and abs) on math 661 * instructions, so we also move to a temp to set those up. 662 */ 663 if (src0.file == UNIFORM || src0.abs || src0.negate) { 664 fs_reg expanded = fs_reg(this, glsl_type::float_type); 665 emit(BRW_OPCODE_MOV, expanded, src0); 666 src0 = expanded; 667 } 668 669 if (src1.file == UNIFORM || src1.abs || src1.negate) { 670 fs_reg expanded = fs_reg(this, glsl_type::float_type); 671 emit(BRW_OPCODE_MOV, expanded, src1); 672 src1 = expanded; 673 } 674 675 inst = emit(opcode, dst, src0, src1); 676 } else { 677 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 678 inst = emit(opcode, dst, src0, reg_null_f); 679 680 inst->base_mrf = base_mrf; 681 inst->mlen = 2 * c->dispatch_width / 8; 682 } 683 return inst; 684} 685 686void 687fs_visitor::visit(ir_variable *ir) 688{ 689 fs_reg *reg = NULL; 690 691 if (variable_storage(ir)) 692 return; 693 694 if (strcmp(ir->name, "gl_FragColor") == 0) { 695 this->frag_color = ir; 696 } else if (strcmp(ir->name, "gl_FragData") == 0) { 697 this->frag_data = ir; 698 } else if (strcmp(ir->name, "gl_FragDepth") == 0) { 699 this->frag_depth = ir; 700 } 701 702 if (ir->mode == ir_var_in) { 703 if (!strcmp(ir->name, "gl_FragCoord")) { 704 reg = emit_fragcoord_interpolation(ir); 705 } else if 
(!strcmp(ir->name, "gl_FrontFacing")) { 706 reg = emit_frontfacing_interpolation(ir); 707 } else { 708 reg = emit_general_interpolation(ir); 709 } 710 assert(reg); 711 hash_table_insert(this->variable_ht, reg, ir); 712 return; 713 } 714 715 if (ir->mode == ir_var_uniform) { 716 int param_index = c->prog_data.nr_params; 717 718 if (!strncmp(ir->name, "gl_", 3)) { 719 setup_builtin_uniform_values(ir); 720 } else { 721 setup_uniform_values(ir->location, ir->type); 722 } 723 724 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); 725 reg->type = brw_type_for_base_type(ir->type); 726 } 727 728 if (!reg) 729 reg = new(this->mem_ctx) fs_reg(this, ir->type); 730 731 hash_table_insert(this->variable_ht, reg, ir); 732} 733 734void 735fs_visitor::visit(ir_dereference_variable *ir) 736{ 737 fs_reg *reg = variable_storage(ir->var); 738 this->result = *reg; 739} 740 741void 742fs_visitor::visit(ir_dereference_record *ir) 743{ 744 const glsl_type *struct_type = ir->record->type; 745 746 ir->record->accept(this); 747 748 unsigned int offset = 0; 749 for (unsigned int i = 0; i < struct_type->length; i++) { 750 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) 751 break; 752 offset += type_size(struct_type->fields.structure[i].type); 753 } 754 this->result.reg_offset += offset; 755 this->result.type = brw_type_for_base_type(ir->type); 756} 757 758void 759fs_visitor::visit(ir_dereference_array *ir) 760{ 761 ir_constant *index; 762 int element_size; 763 764 ir->array->accept(this); 765 index = ir->array_index->as_constant(); 766 767 element_size = type_size(ir->type); 768 this->result.type = brw_type_for_base_type(ir->type); 769 770 if (index) { 771 assert(this->result.file == UNIFORM || 772 (this->result.file == GRF && 773 this->result.reg != 0)); 774 this->result.reg_offset += index->value.i[0] * element_size; 775 } else { 776 assert(!"FINISHME: non-constant array element"); 777 } 778} 779 780/* Instruction selection: Produce a MOV.sat instead of 781 * 
MIN(MAX(val, 0), 1) when possible. 782 */ 783bool 784fs_visitor::try_emit_saturate(ir_expression *ir) 785{ 786 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 787 788 if (!sat_val) 789 return false; 790 791 sat_val->accept(this); 792 fs_reg src = this->result; 793 794 this->result = fs_reg(this, ir->type); 795 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src); 796 inst->saturate = true; 797 798 return true; 799} 800 801static uint32_t 802brw_conditional_for_comparison(unsigned int op) 803{ 804 switch (op) { 805 case ir_binop_less: 806 return BRW_CONDITIONAL_L; 807 case ir_binop_greater: 808 return BRW_CONDITIONAL_G; 809 case ir_binop_lequal: 810 return BRW_CONDITIONAL_LE; 811 case ir_binop_gequal: 812 return BRW_CONDITIONAL_GE; 813 case ir_binop_equal: 814 case ir_binop_all_equal: /* same as equal for scalars */ 815 return BRW_CONDITIONAL_Z; 816 case ir_binop_nequal: 817 case ir_binop_any_nequal: /* same as nequal for scalars */ 818 return BRW_CONDITIONAL_NZ; 819 default: 820 assert(!"not reached: bad operation for comparison"); 821 return BRW_CONDITIONAL_NZ; 822 } 823} 824 825void 826fs_visitor::visit(ir_expression *ir) 827{ 828 unsigned int operand; 829 fs_reg op[2], temp; 830 fs_inst *inst; 831 832 assert(ir->get_num_operands() <= 2); 833 834 if (try_emit_saturate(ir)) 835 return; 836 837 for (operand = 0; operand < ir->get_num_operands(); operand++) { 838 ir->operands[operand]->accept(this); 839 if (this->result.file == BAD_FILE) { 840 ir_print_visitor v; 841 fail("Failed to get tree for expression operand:\n"); 842 ir->operands[operand]->accept(&v); 843 } 844 op[operand] = this->result; 845 846 /* Matrix expression operands should have been broken down to vector 847 * operations already. 848 */ 849 assert(!ir->operands[operand]->type->is_matrix()); 850 /* And then those vector operands should have been broken down to scalar. 851 */ 852 assert(!ir->operands[operand]->type->is_vector()); 853 } 854 855 /* Storage for our result. 
If our result goes into an assignment, it will 856 * just get copy-propagated out, so no worries. 857 */ 858 this->result = fs_reg(this, ir->type); 859 860 switch (ir->operation) { 861 case ir_unop_logic_not: 862 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 863 * ones complement of the whole register, not just bit 0. 864 */ 865 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)); 866 break; 867 case ir_unop_neg: 868 op[0].negate = !op[0].negate; 869 this->result = op[0]; 870 break; 871 case ir_unop_abs: 872 op[0].abs = true; 873 op[0].negate = false; 874 this->result = op[0]; 875 break; 876 case ir_unop_sign: 877 temp = fs_reg(this, ir->type); 878 879 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)); 880 881 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 882 inst->conditional_mod = BRW_CONDITIONAL_G; 883 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)); 884 inst->predicated = true; 885 886 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 887 inst->conditional_mod = BRW_CONDITIONAL_L; 888 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)); 889 inst->predicated = true; 890 891 break; 892 case ir_unop_rcp: 893 emit_math(FS_OPCODE_RCP, this->result, op[0]); 894 break; 895 896 case ir_unop_exp2: 897 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 898 break; 899 case ir_unop_log2: 900 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 901 break; 902 case ir_unop_exp: 903 case ir_unop_log: 904 assert(!"not reached: should be handled by ir_explog_to_explog2"); 905 break; 906 case ir_unop_sin: 907 case ir_unop_sin_reduced: 908 emit_math(FS_OPCODE_SIN, this->result, op[0]); 909 break; 910 case ir_unop_cos: 911 case ir_unop_cos_reduced: 912 emit_math(FS_OPCODE_COS, this->result, op[0]); 913 break; 914 915 case ir_unop_dFdx: 916 emit(FS_OPCODE_DDX, this->result, op[0]); 917 break; 918 case ir_unop_dFdy: 919 emit(FS_OPCODE_DDY, this->result, op[0]); 920 break; 921 922 case ir_binop_add: 923 emit(BRW_OPCODE_ADD, 
this->result, op[0], op[1]); 924 break; 925 case ir_binop_sub: 926 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 927 break; 928 929 case ir_binop_mul: 930 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]); 931 break; 932 case ir_binop_div: 933 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 934 break; 935 case ir_binop_mod: 936 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 937 break; 938 939 case ir_binop_less: 940 case ir_binop_greater: 941 case ir_binop_lequal: 942 case ir_binop_gequal: 943 case ir_binop_equal: 944 case ir_binop_all_equal: 945 case ir_binop_nequal: 946 case ir_binop_any_nequal: 947 temp = this->result; 948 /* original gen4 does implicit conversion before comparison. */ 949 if (intel->gen < 5) 950 temp.type = op[0].type; 951 952 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); 953 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 954 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)); 955 break; 956 957 case ir_binop_logic_xor: 958 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 959 break; 960 961 case ir_binop_logic_or: 962 emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 963 break; 964 965 case ir_binop_logic_and: 966 emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 967 break; 968 969 case ir_binop_dot: 970 case ir_unop_any: 971 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 972 break; 973 974 case ir_unop_noise: 975 assert(!"not reached: should be handled by lower_noise"); 976 break; 977 978 case ir_quadop_vector: 979 assert(!"not reached: should be handled by lower_quadop_vector"); 980 break; 981 982 case ir_unop_sqrt: 983 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 984 break; 985 986 case ir_unop_rsq: 987 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 988 break; 989 990 case ir_unop_i2f: 991 case ir_unop_b2f: 992 case ir_unop_b2i: 993 case ir_unop_f2i: 994 emit(BRW_OPCODE_MOV, this->result, op[0]); 995 break; 996 case 
ir_unop_f2b: 997 case ir_unop_i2b: 998 temp = this->result; 999 /* original gen4 does implicit conversion before comparison. */ 1000 if (intel->gen < 5) 1001 temp.type = op[0].type; 1002 1003 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)); 1004 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1005 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1)); 1006 break; 1007 1008 case ir_unop_trunc: 1009 emit(BRW_OPCODE_RNDZ, this->result, op[0]); 1010 break; 1011 case ir_unop_ceil: 1012 op[0].negate = !op[0].negate; 1013 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1014 this->result.negate = true; 1015 break; 1016 case ir_unop_floor: 1017 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1018 break; 1019 case ir_unop_fract: 1020 inst = emit(BRW_OPCODE_FRC, this->result, op[0]); 1021 break; 1022 case ir_unop_round_even: 1023 emit(BRW_OPCODE_RNDE, this->result, op[0]); 1024 break; 1025 1026 case ir_binop_min: 1027 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1028 inst->conditional_mod = BRW_CONDITIONAL_L; 1029 1030 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1031 inst->predicated = true; 1032 break; 1033 case ir_binop_max: 1034 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1035 inst->conditional_mod = BRW_CONDITIONAL_G; 1036 1037 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1038 inst->predicated = true; 1039 break; 1040 1041 case ir_binop_pow: 1042 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1043 break; 1044 1045 case ir_unop_bit_not: 1046 inst = emit(BRW_OPCODE_NOT, this->result, op[0]); 1047 break; 1048 case ir_binop_bit_and: 1049 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 1050 break; 1051 case ir_binop_bit_xor: 1052 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 1053 break; 1054 case ir_binop_bit_or: 1055 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 1056 break; 1057 1058 case ir_unop_u2f: 1059 case ir_binop_lshift: 1060 case ir_binop_rshift: 1061 
assert(!"GLSL 1.30 features unsupported"); 1062 break; 1063 } 1064} 1065 1066void 1067fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1068 const glsl_type *type, bool predicated) 1069{ 1070 switch (type->base_type) { 1071 case GLSL_TYPE_FLOAT: 1072 case GLSL_TYPE_UINT: 1073 case GLSL_TYPE_INT: 1074 case GLSL_TYPE_BOOL: 1075 for (unsigned int i = 0; i < type->components(); i++) { 1076 l.type = brw_type_for_base_type(type); 1077 r.type = brw_type_for_base_type(type); 1078 1079 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r); 1080 inst->predicated = predicated; 1081 1082 l.reg_offset++; 1083 r.reg_offset++; 1084 } 1085 break; 1086 case GLSL_TYPE_ARRAY: 1087 for (unsigned int i = 0; i < type->length; i++) { 1088 emit_assignment_writes(l, r, type->fields.array, predicated); 1089 } 1090 break; 1091 1092 case GLSL_TYPE_STRUCT: 1093 for (unsigned int i = 0; i < type->length; i++) { 1094 emit_assignment_writes(l, r, type->fields.structure[i].type, 1095 predicated); 1096 } 1097 break; 1098 1099 case GLSL_TYPE_SAMPLER: 1100 break; 1101 1102 default: 1103 assert(!"not reached"); 1104 break; 1105 } 1106} 1107 1108void 1109fs_visitor::visit(ir_assignment *ir) 1110{ 1111 struct fs_reg l, r; 1112 fs_inst *inst; 1113 1114 /* FINISHME: arrays on the lhs */ 1115 ir->lhs->accept(this); 1116 l = this->result; 1117 1118 ir->rhs->accept(this); 1119 r = this->result; 1120 1121 assert(l.file != BAD_FILE); 1122 assert(r.file != BAD_FILE); 1123 1124 if (ir->condition) { 1125 emit_bool_to_cond_code(ir->condition); 1126 } 1127 1128 if (ir->lhs->type->is_scalar() || 1129 ir->lhs->type->is_vector()) { 1130 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1131 if (ir->write_mask & (1 << i)) { 1132 inst = emit(BRW_OPCODE_MOV, l, r); 1133 if (ir->condition) 1134 inst->predicated = true; 1135 r.reg_offset++; 1136 } 1137 l.reg_offset++; 1138 } 1139 } else { 1140 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1141 } 1142} 1143 1144fs_inst * 
1145fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1146{ 1147 int mlen; 1148 int base_mrf = 1; 1149 bool simd16 = false; 1150 fs_reg orig_dst; 1151 1152 /* g0 header. */ 1153 mlen = 1; 1154 1155 if (ir->shadow_comparitor) { 1156 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1157 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1158 coordinate.reg_offset++; 1159 } 1160 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1161 mlen += 3; 1162 1163 if (ir->op == ir_tex) { 1164 /* There's no plain shadow compare message, so we use shadow 1165 * compare with a bias of 0.0. 1166 */ 1167 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); 1168 mlen++; 1169 } else if (ir->op == ir_txb) { 1170 ir->lod_info.bias->accept(this); 1171 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1172 mlen++; 1173 } else { 1174 assert(ir->op == ir_txl); 1175 ir->lod_info.lod->accept(this); 1176 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1177 mlen++; 1178 } 1179 1180 ir->shadow_comparitor->accept(this); 1181 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1182 mlen++; 1183 } else if (ir->op == ir_tex) { 1184 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1185 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1186 coordinate.reg_offset++; 1187 } 1188 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1189 mlen += 3; 1190 } else if (ir->op == ir_txd) { 1191 assert(!"TXD isn't supported on gen4 yet."); 1192 } else { 1193 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1194 * instructions. We'll need to do SIMD16 here. 
1195 */ 1196 assert(ir->op == ir_txb || ir->op == ir_txl); 1197 1198 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1199 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate); 1200 coordinate.reg_offset++; 1201 } 1202 1203 /* lod/bias appears after u/v/r. */ 1204 mlen += 6; 1205 1206 if (ir->op == ir_txb) { 1207 ir->lod_info.bias->accept(this); 1208 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1209 mlen++; 1210 } else { 1211 ir->lod_info.lod->accept(this); 1212 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1213 mlen++; 1214 } 1215 1216 /* The unused upper half. */ 1217 mlen++; 1218 1219 /* Now, since we're doing simd16, the return is 2 interleaved 1220 * vec4s where the odd-indexed ones are junk. We'll need to move 1221 * this weirdness around to the expected layout. 1222 */ 1223 simd16 = true; 1224 orig_dst = dst; 1225 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1226 2)); 1227 dst.type = BRW_REGISTER_TYPE_F; 1228 } 1229 1230 fs_inst *inst = NULL; 1231 switch (ir->op) { 1232 case ir_tex: 1233 inst = emit(FS_OPCODE_TEX, dst); 1234 break; 1235 case ir_txb: 1236 inst = emit(FS_OPCODE_TXB, dst); 1237 break; 1238 case ir_txl: 1239 inst = emit(FS_OPCODE_TXL, dst); 1240 break; 1241 case ir_txd: 1242 inst = emit(FS_OPCODE_TXD, dst); 1243 break; 1244 case ir_txf: 1245 assert(!"GLSL 1.30 features unsupported"); 1246 break; 1247 } 1248 inst->base_mrf = base_mrf; 1249 inst->mlen = mlen; 1250 1251 if (simd16) { 1252 for (int i = 0; i < 4; i++) { 1253 emit(BRW_OPCODE_MOV, orig_dst, dst); 1254 orig_dst.reg_offset++; 1255 dst.reg_offset += 2; 1256 } 1257 } 1258 1259 return inst; 1260} 1261 1262/* gen5's sampler has slots for u, v, r, array index, then optional 1263 * parameters like shadow comparitor or LOD bias. If optional 1264 * parameters aren't present, those base slots are optional and don't 1265 * need to be included in the message. 
1266 * 1267 * We don't fill in the unnecessary slots regardless, which may look 1268 * surprising in the disassembly. 1269 */ 1270fs_inst * 1271fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1272{ 1273 int mlen = 1; /* g0 header always present. */ 1274 int base_mrf = 1; 1275 int reg_width = c->dispatch_width / 8; 1276 1277 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1278 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * reg_width), 1279 coordinate); 1280 coordinate.reg_offset++; 1281 } 1282 mlen += ir->coordinate->type->vector_elements * reg_width; 1283 1284 if (ir->shadow_comparitor) { 1285 mlen = MAX2(mlen, 1 + 4 * reg_width); 1286 1287 ir->shadow_comparitor->accept(this); 1288 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1289 mlen += reg_width; 1290 } 1291 1292 fs_inst *inst = NULL; 1293 switch (ir->op) { 1294 case ir_tex: 1295 inst = emit(FS_OPCODE_TEX, dst); 1296 break; 1297 case ir_txb: 1298 ir->lod_info.bias->accept(this); 1299 mlen = MAX2(mlen, 1 + 4 * reg_width); 1300 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1301 mlen += reg_width; 1302 1303 inst = emit(FS_OPCODE_TXB, dst); 1304 1305 break; 1306 case ir_txl: 1307 ir->lod_info.lod->accept(this); 1308 mlen = MAX2(mlen, 1 + 4 * reg_width); 1309 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1310 mlen += reg_width; 1311 1312 inst = emit(FS_OPCODE_TXL, dst); 1313 break; 1314 case ir_txd: 1315 case ir_txf: 1316 assert(!"GLSL 1.30 features unsupported"); 1317 break; 1318 } 1319 inst->base_mrf = base_mrf; 1320 inst->mlen = mlen; 1321 1322 if (mlen > 11) { 1323 fail("Message length >11 disallowed by hardware\n"); 1324 } 1325 1326 return inst; 1327} 1328 1329void 1330fs_visitor::visit(ir_texture *ir) 1331{ 1332 int sampler; 1333 fs_inst *inst = NULL; 1334 1335 ir->coordinate->accept(this); 1336 fs_reg coordinate = this->result; 1337 1338 if (ir->offset != NULL) { 1339 ir_constant *offset = 
ir->offset->as_constant(); 1340 assert(offset != NULL); 1341 1342 signed char offsets[3]; 1343 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) 1344 offsets[i] = (signed char) offset->value.i[i]; 1345 1346 /* Combine all three offsets into a single unsigned dword: 1347 * 1348 * bits 11:8 - U Offset (X component) 1349 * bits 7:4 - V Offset (Y component) 1350 * bits 3:0 - R Offset (Z component) 1351 */ 1352 unsigned offset_bits = 0; 1353 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { 1354 const unsigned shift = 4 * (2 - i); 1355 offset_bits |= (offsets[i] << shift) & (0xF << shift); 1356 } 1357 1358 /* Explicitly set up the message header by copying g0 to msg reg m1. */ 1359 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), 1360 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD)); 1361 1362 /* Then set the offset bits in DWord 2 of the message header. */ 1363 emit(BRW_OPCODE_MOV, 1364 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), 1365 BRW_REGISTER_TYPE_UD)), 1366 fs_reg(brw_imm_uw(offset_bits))); 1367 } 1368 1369 /* Should be lowered by do_lower_texture_projection */ 1370 assert(!ir->projector); 1371 1372 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1373 ctx->Shader.CurrentFragmentProgram, 1374 &brw->fragment_program->Base); 1375 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1376 1377 /* The 965 requires the EU to do the normalization of GL rectangle 1378 * texture coordinates. We use the program parameter state 1379 * tracking to get the scaling factor. 
1380 */ 1381 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1382 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1383 int tokens[STATE_LENGTH] = { 1384 STATE_INTERNAL, 1385 STATE_TEXRECT_SCALE, 1386 sampler, 1387 0, 1388 0 1389 }; 1390 1391 c->prog_data.param_convert[c->prog_data.nr_params] = 1392 PARAM_NO_CONVERT; 1393 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1394 PARAM_NO_CONVERT; 1395 1396 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1397 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1398 GLuint index = _mesa_add_state_reference(params, 1399 (gl_state_index *)tokens); 1400 1401 this->param_index[c->prog_data.nr_params] = index; 1402 this->param_offset[c->prog_data.nr_params] = 0; 1403 c->prog_data.nr_params++; 1404 this->param_index[c->prog_data.nr_params] = index; 1405 this->param_offset[c->prog_data.nr_params] = 1; 1406 c->prog_data.nr_params++; 1407 1408 fs_reg dst = fs_reg(this, ir->coordinate->type); 1409 fs_reg src = coordinate; 1410 coordinate = dst; 1411 1412 emit(BRW_OPCODE_MUL, dst, src, scale_x); 1413 dst.reg_offset++; 1414 src.reg_offset++; 1415 emit(BRW_OPCODE_MUL, dst, src, scale_y); 1416 } 1417 1418 /* Writemasking doesn't eliminate channels on SIMD8 texture 1419 * samples, so don't worry about them. 1420 */ 1421 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1422 1423 if (intel->gen < 5) { 1424 inst = emit_texture_gen4(ir, dst, coordinate); 1425 } else { 1426 inst = emit_texture_gen5(ir, dst, coordinate); 1427 } 1428 1429 /* If there's an offset, we already set up m1. To avoid the implied move, 1430 * use the null register. Otherwise, we want an implied move from g0. 
1431 */ 1432 if (ir->offset != NULL) 1433 inst->src[0] = fs_reg(brw_null_reg()); 1434 else 1435 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); 1436 1437 inst->sampler = sampler; 1438 1439 this->result = dst; 1440 1441 if (ir->shadow_comparitor) 1442 inst->shadow_compare = true; 1443 1444 if (ir->type == glsl_type::float_type) { 1445 /* Ignore DEPTH_TEXTURE_MODE swizzling. */ 1446 assert(ir->sampler->type->sampler_shadow); 1447 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1448 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1449 1450 for (int i = 0; i < 4; i++) { 1451 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1452 fs_reg l = swizzle_dst; 1453 l.reg_offset += i; 1454 1455 if (swiz == SWIZZLE_ZERO) { 1456 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f)); 1457 } else if (swiz == SWIZZLE_ONE) { 1458 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f)); 1459 } else { 1460 fs_reg r = dst; 1461 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1462 emit(BRW_OPCODE_MOV, l, r); 1463 } 1464 } 1465 this->result = swizzle_dst; 1466 } 1467} 1468 1469void 1470fs_visitor::visit(ir_swizzle *ir) 1471{ 1472 ir->val->accept(this); 1473 fs_reg val = this->result; 1474 1475 if (ir->type->vector_elements == 1) { 1476 this->result.reg_offset += ir->mask.x; 1477 return; 1478 } 1479 1480 fs_reg result = fs_reg(this, ir->type); 1481 this->result = result; 1482 1483 for (unsigned int i = 0; i < ir->type->vector_elements; i++) { 1484 fs_reg channel = val; 1485 int swiz = 0; 1486 1487 switch (i) { 1488 case 0: 1489 swiz = ir->mask.x; 1490 break; 1491 case 1: 1492 swiz = ir->mask.y; 1493 break; 1494 case 2: 1495 swiz = ir->mask.z; 1496 break; 1497 case 3: 1498 swiz = ir->mask.w; 1499 break; 1500 } 1501 1502 channel.reg_offset += swiz; 1503 emit(BRW_OPCODE_MOV, result, channel); 1504 result.reg_offset++; 1505 } 1506} 1507 1508void 1509fs_visitor::visit(ir_discard *ir) 1510{ 1511 fs_reg temp = fs_reg(this, glsl_type::uint_type); 1512 1513 
assert(ir->condition == NULL); /* FINISHME */ 1514 1515 emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d); 1516 emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp); 1517 kill_emitted = true; 1518} 1519 1520void 1521fs_visitor::visit(ir_constant *ir) 1522{ 1523 /* Set this->result to reg at the bottom of the function because some code 1524 * paths will cause this visitor to be applied to other fields. This will 1525 * cause the value stored in this->result to be modified. 1526 * 1527 * Make reg constant so that it doesn't get accidentally modified along the 1528 * way. Yes, I actually had this problem. :( 1529 */ 1530 const fs_reg reg(this, ir->type); 1531 fs_reg dst_reg = reg; 1532 1533 if (ir->type->is_array()) { 1534 const unsigned size = type_size(ir->type->fields.array); 1535 1536 for (unsigned i = 0; i < ir->type->length; i++) { 1537 ir->array_elements[i]->accept(this); 1538 fs_reg src_reg = this->result; 1539 1540 dst_reg.type = src_reg.type; 1541 for (unsigned j = 0; j < size; j++) { 1542 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1543 src_reg.reg_offset++; 1544 dst_reg.reg_offset++; 1545 } 1546 } 1547 } else if (ir->type->is_record()) { 1548 foreach_list(node, &ir->components) { 1549 ir_instruction *const field = (ir_instruction *) node; 1550 const unsigned size = type_size(field->type); 1551 1552 field->accept(this); 1553 fs_reg src_reg = this->result; 1554 1555 dst_reg.type = src_reg.type; 1556 for (unsigned j = 0; j < size; j++) { 1557 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1558 src_reg.reg_offset++; 1559 dst_reg.reg_offset++; 1560 } 1561 } 1562 } else { 1563 const unsigned size = type_size(ir->type); 1564 1565 for (unsigned i = 0; i < size; i++) { 1566 switch (ir->type->base_type) { 1567 case GLSL_TYPE_FLOAT: 1568 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])); 1569 break; 1570 case GLSL_TYPE_UINT: 1571 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])); 1572 break; 1573 case GLSL_TYPE_INT: 1574 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])); 1575 
break; 1576 case GLSL_TYPE_BOOL: 1577 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])); 1578 break; 1579 default: 1580 assert(!"Non-float/uint/int/bool constant"); 1581 } 1582 dst_reg.reg_offset++; 1583 } 1584 } 1585 1586 this->result = reg; 1587} 1588 1589void 1590fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1591{ 1592 ir_expression *expr = ir->as_expression(); 1593 1594 if (expr) { 1595 fs_reg op[2]; 1596 fs_inst *inst; 1597 1598 assert(expr->get_num_operands() <= 2); 1599 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1600 assert(expr->operands[i]->type->is_scalar()); 1601 1602 expr->operands[i]->accept(this); 1603 op[i] = this->result; 1604 } 1605 1606 switch (expr->operation) { 1607 case ir_unop_logic_not: 1608 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)); 1609 inst->conditional_mod = BRW_CONDITIONAL_Z; 1610 break; 1611 1612 case ir_binop_logic_xor: 1613 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]); 1614 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1615 break; 1616 1617 case ir_binop_logic_or: 1618 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]); 1619 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1620 break; 1621 1622 case ir_binop_logic_and: 1623 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]); 1624 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1625 break; 1626 1627 case ir_unop_f2b: 1628 if (intel->gen >= 6) { 1629 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f)); 1630 } else { 1631 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]); 1632 } 1633 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1634 break; 1635 1636 case ir_unop_i2b: 1637 if (intel->gen >= 6) { 1638 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)); 1639 } else { 1640 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]); 1641 } 1642 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1643 break; 1644 1645 case ir_binop_greater: 1646 case ir_binop_gequal: 1647 case ir_binop_less: 1648 case ir_binop_lequal: 1649 case ir_binop_equal: 1650 
case ir_binop_all_equal: 1651 case ir_binop_nequal: 1652 case ir_binop_any_nequal: 1653 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]); 1654 inst->conditional_mod = 1655 brw_conditional_for_comparison(expr->operation); 1656 break; 1657 1658 default: 1659 assert(!"not reached"); 1660 fail("bad cond code\n"); 1661 break; 1662 } 1663 return; 1664 } 1665 1666 ir->accept(this); 1667 1668 if (intel->gen >= 6) { 1669 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1)); 1670 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1671 } else { 1672 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result); 1673 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1674 } 1675} 1676 1677/** 1678 * Emit a gen6 IF statement with the comparison folded into the IF 1679 * instruction. 1680 */ 1681void 1682fs_visitor::emit_if_gen6(ir_if *ir) 1683{ 1684 ir_expression *expr = ir->condition->as_expression(); 1685 1686 if (expr) { 1687 fs_reg op[2]; 1688 fs_inst *inst; 1689 fs_reg temp; 1690 1691 assert(expr->get_num_operands() <= 2); 1692 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1693 assert(expr->operands[i]->type->is_scalar()); 1694 1695 expr->operands[i]->accept(this); 1696 op[i] = this->result; 1697 } 1698 1699 switch (expr->operation) { 1700 case ir_unop_logic_not: 1701 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0)); 1702 inst->conditional_mod = BRW_CONDITIONAL_Z; 1703 return; 1704 1705 case ir_binop_logic_xor: 1706 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1707 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1708 return; 1709 1710 case ir_binop_logic_or: 1711 temp = fs_reg(this, glsl_type::bool_type); 1712 emit(BRW_OPCODE_OR, temp, op[0], op[1]); 1713 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1714 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1715 return; 1716 1717 case ir_binop_logic_and: 1718 temp = fs_reg(this, glsl_type::bool_type); 1719 emit(BRW_OPCODE_AND, temp, op[0], op[1]); 1720 inst = emit(BRW_OPCODE_IF, 
reg_null_d, temp, fs_reg(0)); 1721 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1722 return; 1723 1724 case ir_unop_f2b: 1725 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)); 1726 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1727 return; 1728 1729 case ir_unop_i2b: 1730 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1731 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1732 return; 1733 1734 case ir_binop_greater: 1735 case ir_binop_gequal: 1736 case ir_binop_less: 1737 case ir_binop_lequal: 1738 case ir_binop_equal: 1739 case ir_binop_all_equal: 1740 case ir_binop_nequal: 1741 case ir_binop_any_nequal: 1742 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1743 inst->conditional_mod = 1744 brw_conditional_for_comparison(expr->operation); 1745 return; 1746 default: 1747 assert(!"not reached"); 1748 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1749 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1750 fail("bad condition\n"); 1751 return; 1752 } 1753 return; 1754 } 1755 1756 ir->condition->accept(this); 1757 1758 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)); 1759 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1760} 1761 1762void 1763fs_visitor::visit(ir_if *ir) 1764{ 1765 fs_inst *inst; 1766 1767 if (c->dispatch_width == 16) { 1768 fail("Can't support (non-uniform) control flow on 16-wide\n"); 1769 } 1770 1771 /* Don't point the annotation at the if statement, because then it plus 1772 * the then and else blocks get printed. 
1773 */ 1774 this->base_ir = ir->condition; 1775 1776 if (intel->gen >= 6) { 1777 emit_if_gen6(ir); 1778 } else { 1779 emit_bool_to_cond_code(ir->condition); 1780 1781 inst = emit(BRW_OPCODE_IF); 1782 inst->predicated = true; 1783 } 1784 1785 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1786 ir_instruction *ir = (ir_instruction *)iter.get(); 1787 this->base_ir = ir; 1788 1789 ir->accept(this); 1790 } 1791 1792 if (!ir->else_instructions.is_empty()) { 1793 emit(BRW_OPCODE_ELSE); 1794 1795 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1796 ir_instruction *ir = (ir_instruction *)iter.get(); 1797 this->base_ir = ir; 1798 1799 ir->accept(this); 1800 } 1801 } 1802 1803 emit(BRW_OPCODE_ENDIF); 1804} 1805 1806void 1807fs_visitor::visit(ir_loop *ir) 1808{ 1809 fs_reg counter = reg_undef; 1810 1811 if (c->dispatch_width == 16) { 1812 fail("Can't support (non-uniform) control flow on 16-wide\n"); 1813 } 1814 1815 if (ir->counter) { 1816 this->base_ir = ir->counter; 1817 ir->counter->accept(this); 1818 counter = *(variable_storage(ir->counter)); 1819 1820 if (ir->from) { 1821 this->base_ir = ir->from; 1822 ir->from->accept(this); 1823 1824 emit(BRW_OPCODE_MOV, counter, this->result); 1825 } 1826 } 1827 1828 emit(BRW_OPCODE_DO); 1829 1830 if (ir->to) { 1831 this->base_ir = ir->to; 1832 ir->to->accept(this); 1833 1834 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result); 1835 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); 1836 1837 inst = emit(BRW_OPCODE_BREAK); 1838 inst->predicated = true; 1839 } 1840 1841 foreach_iter(exec_list_iterator, iter, ir->body_instructions) { 1842 ir_instruction *ir = (ir_instruction *)iter.get(); 1843 1844 this->base_ir = ir; 1845 ir->accept(this); 1846 } 1847 1848 if (ir->increment) { 1849 this->base_ir = ir->increment; 1850 ir->increment->accept(this); 1851 emit(BRW_OPCODE_ADD, counter, counter, this->result); 1852 } 1853 1854 emit(BRW_OPCODE_WHILE); 1855} 1856 1857void 
1858fs_visitor::visit(ir_loop_jump *ir) 1859{ 1860 switch (ir->mode) { 1861 case ir_loop_jump::jump_break: 1862 emit(BRW_OPCODE_BREAK); 1863 break; 1864 case ir_loop_jump::jump_continue: 1865 emit(BRW_OPCODE_CONTINUE); 1866 break; 1867 } 1868} 1869 1870void 1871fs_visitor::visit(ir_call *ir) 1872{ 1873 assert(!"FINISHME"); 1874} 1875 1876void 1877fs_visitor::visit(ir_return *ir) 1878{ 1879 assert(!"FINISHME"); 1880} 1881 1882void 1883fs_visitor::visit(ir_function *ir) 1884{ 1885 /* Ignore function bodies other than main() -- we shouldn't see calls to 1886 * them since they should all be inlined before we get to ir_to_mesa. 1887 */ 1888 if (strcmp(ir->name, "main") == 0) { 1889 const ir_function_signature *sig; 1890 exec_list empty; 1891 1892 sig = ir->matching_signature(&empty); 1893 1894 assert(sig); 1895 1896 foreach_iter(exec_list_iterator, iter, sig->body) { 1897 ir_instruction *ir = (ir_instruction *)iter.get(); 1898 this->base_ir = ir; 1899 1900 ir->accept(this); 1901 } 1902 } 1903} 1904 1905void 1906fs_visitor::visit(ir_function_signature *ir) 1907{ 1908 assert(!"not reached"); 1909 (void)ir; 1910} 1911 1912fs_inst * 1913fs_visitor::emit(fs_inst inst) 1914{ 1915 fs_inst *list_inst = new(mem_ctx) fs_inst; 1916 *list_inst = inst; 1917 1918 if (force_uncompressed_stack > 0) 1919 list_inst->force_uncompressed = true; 1920 else if (force_sechalf_stack > 0) 1921 list_inst->force_sechalf = true; 1922 1923 list_inst->annotation = this->current_annotation; 1924 list_inst->ir = this->base_ir; 1925 1926 this->instructions.push_tail(list_inst); 1927 1928 return list_inst; 1929} 1930 1931/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 1932void 1933fs_visitor::emit_dummy_fs() 1934{ 1935 /* Everyone's favorite color. 
*/ 1936 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f)); 1937 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f)); 1938 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f)); 1939 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f)); 1940 1941 fs_inst *write; 1942 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); 1943 write->base_mrf = 0; 1944} 1945 1946/* The register location here is relative to the start of the URB 1947 * data. It will get adjusted to be a real location before 1948 * generate_code() time. 1949 */ 1950struct brw_reg 1951fs_visitor::interp_reg(int location, int channel) 1952{ 1953 int regnr = urb_setup[location] * 2 + channel / 2; 1954 int stride = (channel & 1) * 4; 1955 1956 assert(urb_setup[location] != -1); 1957 1958 return brw_vec1_grf(regnr, stride); 1959} 1960 1961/** Emits the interpolation for the varying inputs. */ 1962void 1963fs_visitor::emit_interpolation_setup_gen4() 1964{ 1965 this->current_annotation = "compute pixel centers"; 1966 this->pixel_x = fs_reg(this, glsl_type::uint_type); 1967 this->pixel_y = fs_reg(this, glsl_type::uint_type); 1968 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 1969 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 1970 1971 emit(FS_OPCODE_PIXEL_X, this->pixel_x); 1972 emit(FS_OPCODE_PIXEL_Y, this->pixel_y); 1973 1974 this->current_annotation = "compute pixel deltas from v0"; 1975 if (brw->has_pln) { 1976 this->delta_x = fs_reg(this, glsl_type::vec2_type); 1977 this->delta_y = this->delta_x; 1978 this->delta_y.reg_offset++; 1979 } else { 1980 this->delta_x = fs_reg(this, glsl_type::float_type); 1981 this->delta_y = fs_reg(this, glsl_type::float_type); 1982 } 1983 emit(BRW_OPCODE_ADD, this->delta_x, 1984 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))); 1985 emit(BRW_OPCODE_ADD, this->delta_y, 1986 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))); 1987 1988 this->current_annotation = "compute pos.w and 1/pos.w"; 1989 /* Compute wpos.w. 
It's always in our setup, since it's needed to 1990 * interpolate the other attributes. 1991 */ 1992 this->wpos_w = fs_reg(this, glsl_type::float_type); 1993 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y, 1994 interp_reg(FRAG_ATTRIB_WPOS, 3)); 1995 /* Compute the pixel 1/W value from wpos.w. */ 1996 this->pixel_w = fs_reg(this, glsl_type::float_type); 1997 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 1998 this->current_annotation = NULL; 1999} 2000 2001/** Emits the interpolation for the varying inputs. */ 2002void 2003fs_visitor::emit_interpolation_setup_gen6() 2004{ 2005 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 2006 2007 /* If the pixel centers end up used, the setup is the same as for gen4. */ 2008 this->current_annotation = "compute pixel centers"; 2009 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); 2010 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); 2011 int_pixel_x.type = BRW_REGISTER_TYPE_UW; 2012 int_pixel_y.type = BRW_REGISTER_TYPE_UW; 2013 emit(BRW_OPCODE_ADD, 2014 int_pixel_x, 2015 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 2016 fs_reg(brw_imm_v(0x10101010))); 2017 emit(BRW_OPCODE_ADD, 2018 int_pixel_y, 2019 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 2020 fs_reg(brw_imm_v(0x11001100))); 2021 2022 /* As of gen6, we can no longer mix float and int sources. We have 2023 * to turn the integer pixel centers into floats for their actual 2024 * use. 
2025 */ 2026 this->pixel_x = fs_reg(this, glsl_type::float_type); 2027 this->pixel_y = fs_reg(this, glsl_type::float_type); 2028 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x); 2029 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y); 2030 2031 this->current_annotation = "compute pos.w"; 2032 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); 2033 this->wpos_w = fs_reg(this, glsl_type::float_type); 2034 emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w); 2035 2036 this->delta_x = fs_reg(brw_vec8_grf(2, 0)); 2037 this->delta_y = fs_reg(brw_vec8_grf(3, 0)); 2038 2039 this->current_annotation = NULL; 2040} 2041 2042void 2043fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color) 2044{ 2045 int reg_width = c->dispatch_width / 8; 2046 2047 if (c->dispatch_width == 8 || intel->gen == 6) { 2048 /* SIMD8 write looks like: 2049 * m + 0: r0 2050 * m + 1: r1 2051 * m + 2: g0 2052 * m + 3: g1 2053 * 2054 * gen6 SIMD16 DP write looks like: 2055 * m + 0: r0 2056 * m + 1: r1 2057 * m + 2: g0 2058 * m + 3: g1 2059 * m + 4: b0 2060 * m + 5: b1 2061 * m + 6: a0 2062 * m + 7: a1 2063 */ 2064 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width), 2065 color); 2066 } else { 2067 /* pre-gen6 SIMD16 single source DP write looks like: 2068 * m + 0: r0 2069 * m + 1: g0 2070 * m + 2: b0 2071 * m + 3: a0 2072 * m + 4: r1 2073 * m + 5: g1 2074 * m + 6: b1 2075 * m + 7: a1 2076 */ 2077 if (brw->has_compr4) { 2078 /* By setting the high bit of the MRF register number, we 2079 * indicate that we want COMPR4 mode - instead of doing the 2080 * usual destination + 1 for the second half we get 2081 * destination + 4. 
2082 */ 2083 emit(BRW_OPCODE_MOV, 2084 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color); 2085 } else { 2086 push_force_uncompressed(); 2087 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color); 2088 pop_force_uncompressed(); 2089 2090 push_force_sechalf(); 2091 color.sechalf = true; 2092 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color); 2093 pop_force_sechalf(); 2094 color.sechalf = false; 2095 } 2096 } 2097} 2098 2099void 2100fs_visitor::emit_fb_writes() 2101{ 2102 this->current_annotation = "FB write header"; 2103 GLboolean header_present = GL_TRUE; 2104 int nr = 0; 2105 int reg_width = c->dispatch_width / 8; 2106 2107 if (intel->gen >= 6 && 2108 !this->kill_emitted && 2109 c->key.nr_color_regions == 1) { 2110 header_present = false; 2111 } 2112 2113 if (header_present) { 2114 /* m0, m1 header */ 2115 nr += 2; 2116 } 2117 2118 if (c->aa_dest_stencil_reg) { 2119 push_force_uncompressed(); 2120 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2121 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))); 2122 pop_force_uncompressed(); 2123 } 2124 2125 /* Reserve space for color. It'll be filled in per MRT below. */ 2126 int color_mrf = nr; 2127 nr += 4 * reg_width; 2128 2129 if (c->source_depth_to_render_target) { 2130 if (intel->gen == 6 && c->dispatch_width == 16) { 2131 /* For outputting oDepth on gen6, SIMD8 writes have to be 2132 * used. This would require 8-wide moves of each half to 2133 * message regs, kind of like pre-gen5 SIMD16 FB writes. 2134 * Just bail on doing so for now. 2135 */ 2136 fail("Missing support for simd16 depth writes on gen6\n"); 2137 } 2138 2139 if (c->computes_depth) { 2140 /* Hand over gl_FragDepth. */ 2141 assert(this->frag_depth); 2142 fs_reg depth = *(variable_storage(this->frag_depth)); 2143 2144 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth); 2145 } else { 2146 /* Pass through the payload depth. 
*/ 2147 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), 2148 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 2149 } 2150 nr += reg_width; 2151 } 2152 2153 if (c->dest_depth_reg) { 2154 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), 2155 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))); 2156 nr += reg_width; 2157 } 2158 2159 fs_reg color = reg_undef; 2160 if (this->frag_color) 2161 color = *(variable_storage(this->frag_color)); 2162 else if (this->frag_data) { 2163 color = *(variable_storage(this->frag_data)); 2164 color.type = BRW_REGISTER_TYPE_F; 2165 } 2166 2167 for (int target = 0; target < c->key.nr_color_regions; target++) { 2168 this->current_annotation = ralloc_asprintf(this->mem_ctx, 2169 "FB write target %d", 2170 target); 2171 if (this->frag_color || this->frag_data) { 2172 for (int i = 0; i < 4; i++) { 2173 emit_color_write(i, color_mrf, color); 2174 color.reg_offset++; 2175 } 2176 } 2177 2178 if (this->frag_color) 2179 color.reg_offset -= 4; 2180 2181 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2182 inst->target = target; 2183 inst->base_mrf = 0; 2184 inst->mlen = nr; 2185 if (target == c->key.nr_color_regions - 1) 2186 inst->eot = true; 2187 inst->header_present = header_present; 2188 } 2189 2190 if (c->key.nr_color_regions == 0) { 2191 if (c->key.alpha_test && (this->frag_color || this->frag_data)) { 2192 /* If the alpha test is enabled but there's no color buffer, 2193 * we still need to send alpha out the pipeline to our null 2194 * renderbuffer. 2195 */ 2196 color.reg_offset += 3; 2197 emit_color_write(3, color_mrf, color); 2198 } 2199 2200 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2201 inst->base_mrf = 0; 2202 inst->mlen = nr; 2203 inst->eot = true; 2204 inst->header_present = header_present; 2205 } 2206 2207 this->current_annotation = NULL; 2208} 2209 2210void 2211fs_visitor::generate_fb_write(fs_inst *inst) 2212{ 2213 GLboolean eot = inst->eot; 2214 struct brw_reg implied_header; 2215 2216 /* Header is 2 regs, g0 and g1 are the contents. 
g0 will be implied 2217 * move, here's g1. 2218 */ 2219 brw_push_insn_state(p); 2220 brw_set_mask_control(p, BRW_MASK_DISABLE); 2221 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2222 2223 if (inst->header_present) { 2224 if (intel->gen >= 6) { 2225 brw_MOV(p, 2226 brw_message_reg(inst->base_mrf), 2227 brw_vec8_grf(0, 0)); 2228 2229 if (inst->target > 0) { 2230 /* Set the render target index for choosing BLEND_STATE. */ 2231 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2232 BRW_REGISTER_TYPE_UD), 2233 brw_imm_ud(inst->target)); 2234 } 2235 2236 /* Clear viewport index, render target array index. */ 2237 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0), 2238 BRW_REGISTER_TYPE_UD), 2239 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2240 brw_imm_ud(0xf7ff)); 2241 2242 implied_header = brw_null_reg(); 2243 } else { 2244 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2245 } 2246 2247 brw_MOV(p, 2248 brw_message_reg(inst->base_mrf + 1), 2249 brw_vec8_grf(1, 0)); 2250 } else { 2251 implied_header = brw_null_reg(); 2252 } 2253 2254 brw_pop_insn_state(p); 2255 2256 brw_fb_WRITE(p, 2257 c->dispatch_width, 2258 inst->base_mrf, 2259 implied_header, 2260 inst->target, 2261 inst->mlen, 2262 0, 2263 eot, 2264 inst->header_present); 2265} 2266 2267/* Computes the integer pixel x,y values from the origin. 2268 * 2269 * This is the basis of gl_FragCoord computation, but is also used 2270 * pre-gen6 for computing the deltas from v0 for computing 2271 * interpolation. 
2272 */ 2273void 2274fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x) 2275{ 2276 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 2277 struct brw_reg src; 2278 struct brw_reg deltas; 2279 2280 if (is_x) { 2281 src = stride(suboffset(g1_uw, 4), 2, 4, 0); 2282 deltas = brw_imm_v(0x10101010); 2283 } else { 2284 src = stride(suboffset(g1_uw, 5), 2, 4, 0); 2285 deltas = brw_imm_v(0x11001100); 2286 } 2287 2288 if (c->dispatch_width == 16) { 2289 dst = vec16(dst); 2290 } 2291 2292 /* We do this 8 or 16-wide, but since the destination is UW we 2293 * don't do compression in the 16-wide case. 2294 */ 2295 brw_push_insn_state(p); 2296 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2297 brw_ADD(p, dst, src, deltas); 2298 brw_pop_insn_state(p); 2299} 2300 2301void 2302fs_visitor::generate_linterp(fs_inst *inst, 2303 struct brw_reg dst, struct brw_reg *src) 2304{ 2305 struct brw_reg delta_x = src[0]; 2306 struct brw_reg delta_y = src[1]; 2307 struct brw_reg interp = src[2]; 2308 2309 if (brw->has_pln && 2310 delta_y.nr == delta_x.nr + 1 && 2311 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2312 brw_PLN(p, dst, interp, delta_x); 2313 } else { 2314 brw_LINE(p, brw_null_reg(), interp, delta_x); 2315 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2316 } 2317} 2318 2319void 2320fs_visitor::generate_math(fs_inst *inst, 2321 struct brw_reg dst, struct brw_reg *src) 2322{ 2323 int op; 2324 2325 switch (inst->opcode) { 2326 case FS_OPCODE_RCP: 2327 op = BRW_MATH_FUNCTION_INV; 2328 break; 2329 case FS_OPCODE_RSQ: 2330 op = BRW_MATH_FUNCTION_RSQ; 2331 break; 2332 case FS_OPCODE_SQRT: 2333 op = BRW_MATH_FUNCTION_SQRT; 2334 break; 2335 case FS_OPCODE_EXP2: 2336 op = BRW_MATH_FUNCTION_EXP; 2337 break; 2338 case FS_OPCODE_LOG2: 2339 op = BRW_MATH_FUNCTION_LOG; 2340 break; 2341 case FS_OPCODE_POW: 2342 op = BRW_MATH_FUNCTION_POW; 2343 break; 2344 case FS_OPCODE_SIN: 2345 op = BRW_MATH_FUNCTION_SIN; 2346 break; 2347 case FS_OPCODE_COS: 2348 op = 
BRW_MATH_FUNCTION_COS; 2349 break; 2350 default: 2351 assert(!"not reached: unknown math function"); 2352 op = 0; 2353 break; 2354 } 2355 2356 if (intel->gen >= 6) { 2357 assert(inst->mlen == 0); 2358 2359 if (inst->opcode == FS_OPCODE_POW) { 2360 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2361 brw_math2(p, dst, op, src[0], src[1]); 2362 2363 if (c->dispatch_width == 16) { 2364 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2365 brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1])); 2366 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2367 } 2368 } else { 2369 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2370 brw_math(p, dst, 2371 op, 2372 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2373 BRW_MATH_SATURATE_NONE, 2374 0, src[0], 2375 BRW_MATH_DATA_VECTOR, 2376 BRW_MATH_PRECISION_FULL); 2377 2378 if (c->dispatch_width == 16) { 2379 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2380 brw_math(p, sechalf(dst), 2381 op, 2382 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2383 BRW_MATH_SATURATE_NONE, 2384 0, sechalf(src[0]), 2385 BRW_MATH_DATA_VECTOR, 2386 BRW_MATH_PRECISION_FULL); 2387 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2388 } 2389 } 2390 } else /* gen <= 5 */{ 2391 assert(inst->mlen >= 1); 2392 2393 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2394 brw_math(p, dst, 2395 op, 2396 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2397 BRW_MATH_SATURATE_NONE, 2398 inst->base_mrf, src[0], 2399 BRW_MATH_DATA_VECTOR, 2400 BRW_MATH_PRECISION_FULL); 2401 2402 if (c->dispatch_width == 16) { 2403 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2404 brw_math(p, sechalf(dst), 2405 op, 2406 inst->saturate ? 
BRW_MATH_SATURATE_SATURATE : 2407 BRW_MATH_SATURATE_NONE, 2408 inst->base_mrf + 1, sechalf(src[0]), 2409 BRW_MATH_DATA_VECTOR, 2410 BRW_MATH_PRECISION_FULL); 2411 2412 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2413 } 2414 } 2415} 2416 2417void 2418fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2419{ 2420 int msg_type = -1; 2421 int rlen = 4; 2422 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2423 2424 if (c->dispatch_width == 16) { 2425 rlen = 8; 2426 dst = vec16(dst); 2427 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2428 } 2429 2430 if (intel->gen >= 5) { 2431 switch (inst->opcode) { 2432 case FS_OPCODE_TEX: 2433 if (inst->shadow_compare) { 2434 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 2435 } else { 2436 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 2437 } 2438 break; 2439 case FS_OPCODE_TXB: 2440 if (inst->shadow_compare) { 2441 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 2442 } else { 2443 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 2444 } 2445 break; 2446 case FS_OPCODE_TXL: 2447 if (inst->shadow_compare) { 2448 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 2449 } else { 2450 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 2451 } 2452 break; 2453 case FS_OPCODE_TXD: 2454 assert(!"TXD isn't supported on gen5+ yet."); 2455 break; 2456 } 2457 } else { 2458 switch (inst->opcode) { 2459 case FS_OPCODE_TEX: 2460 /* Note that G45 and older determines shadow compare and dispatch width 2461 * from message length for most messages. 
2462 */ 2463 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2464 if (inst->shadow_compare) { 2465 assert(inst->mlen == 6); 2466 } else { 2467 assert(inst->mlen <= 4); 2468 } 2469 break; 2470 case FS_OPCODE_TXB: 2471 if (inst->shadow_compare) { 2472 assert(inst->mlen == 6); 2473 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 2474 } else { 2475 assert(inst->mlen == 9); 2476 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2477 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2478 } 2479 break; 2480 case FS_OPCODE_TXL: 2481 if (inst->shadow_compare) { 2482 assert(inst->mlen == 6); 2483 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 2484 } else { 2485 assert(inst->mlen == 9); 2486 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 2487 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2488 } 2489 break; 2490 case FS_OPCODE_TXD: 2491 assert(!"TXD isn't supported on gen4 yet."); 2492 break; 2493 } 2494 } 2495 assert(msg_type != -1); 2496 2497 brw_SAMPLE(p, 2498 retype(dst, BRW_REGISTER_TYPE_UW), 2499 inst->base_mrf, 2500 src, 2501 SURF_INDEX_TEXTURE(inst->sampler), 2502 inst->sampler, 2503 WRITEMASK_XYZW, 2504 msg_type, 2505 rlen, 2506 inst->mlen, 2507 0, 2508 1, 2509 simd_mode); 2510} 2511 2512 2513/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2514 * looking like: 2515 * 2516 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2517 * 2518 * and we're trying to produce: 2519 * 2520 * DDX DDY 2521 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2522 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2523 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2524 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2525 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2526 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2527 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2528 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2529 * 2530 * and add another set of two more subspans if in 16-pixel dispatch mode. 
2531 * 2532 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2533 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2534 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2535 * between each other. We could probably do it like ddx and swizzle the right 2536 * order later, but bail for now and just produce 2537 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2538 */ 2539void 2540fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2541{ 2542 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2543 BRW_REGISTER_TYPE_F, 2544 BRW_VERTICAL_STRIDE_2, 2545 BRW_WIDTH_2, 2546 BRW_HORIZONTAL_STRIDE_0, 2547 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2548 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2549 BRW_REGISTER_TYPE_F, 2550 BRW_VERTICAL_STRIDE_2, 2551 BRW_WIDTH_2, 2552 BRW_HORIZONTAL_STRIDE_0, 2553 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2554 brw_ADD(p, dst, src0, negate(src1)); 2555} 2556 2557void 2558fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2559{ 2560 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2561 BRW_REGISTER_TYPE_F, 2562 BRW_VERTICAL_STRIDE_4, 2563 BRW_WIDTH_4, 2564 BRW_HORIZONTAL_STRIDE_0, 2565 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2566 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2567 BRW_REGISTER_TYPE_F, 2568 BRW_VERTICAL_STRIDE_4, 2569 BRW_WIDTH_4, 2570 BRW_HORIZONTAL_STRIDE_0, 2571 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2572 brw_ADD(p, dst, src0, negate(src1)); 2573} 2574 2575void 2576fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask) 2577{ 2578 if (intel->gen >= 6) { 2579 /* Gen6 no longer has the mask reg for us to just read the 2580 * active channels from. However, cmp updates just the channels 2581 * of the flag reg that are enabled, so we can get at the 2582 * channel enables that way. In this step, make a reg of ones 2583 * we'll compare to. 
2584 */ 2585 brw_MOV(p, mask, brw_imm_ud(1)); 2586 } else { 2587 brw_push_insn_state(p); 2588 brw_set_mask_control(p, BRW_MASK_DISABLE); 2589 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2590 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */ 2591 brw_pop_insn_state(p); 2592 } 2593} 2594 2595void 2596fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) 2597{ 2598 if (intel->gen >= 6) { 2599 struct brw_reg f0 = brw_flag_reg(); 2600 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2601 2602 brw_push_insn_state(p); 2603 brw_set_mask_control(p, BRW_MASK_DISABLE); 2604 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */ 2605 brw_pop_insn_state(p); 2606 2607 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2608 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */ 2609 /* Undo CMP's whacking of predication*/ 2610 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2611 2612 brw_push_insn_state(p); 2613 brw_set_mask_control(p, BRW_MASK_DISABLE); 2614 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2615 brw_AND(p, g1, f0, g1); 2616 brw_pop_insn_state(p); 2617 } else { 2618 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2619 2620 mask = brw_uw1_reg(mask.file, mask.nr, 0); 2621 2622 brw_push_insn_state(p); 2623 brw_set_mask_control(p, BRW_MASK_DISABLE); 2624 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2625 brw_AND(p, g0, mask, g0); 2626 brw_pop_insn_state(p); 2627 } 2628} 2629 2630void 2631fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2632{ 2633 assert(inst->mlen != 0); 2634 2635 brw_MOV(p, 2636 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2637 retype(src, BRW_REGISTER_TYPE_UD)); 2638 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2639 inst->offset); 2640} 2641 2642void 2643fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2644{ 2645 assert(inst->mlen != 0); 2646 2647 /* Clear 
any post destination dependencies that would be ignored by 2648 * the block read. See the B-Spec for pre-gen5 send instruction. 2649 * 2650 * This could use a better solution, since texture sampling and 2651 * math reads could potentially run into it as well -- anywhere 2652 * that we have a SEND with a destination that is a register that 2653 * was written but not read within the last N instructions (what's 2654 * N? unsure). This is rare because of dead code elimination, but 2655 * not impossible. 2656 */ 2657 if (intel->gen == 4 && !intel->is_g4x) 2658 brw_MOV(p, brw_null_reg(), dst); 2659 2660 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2661 inst->offset); 2662 2663 if (intel->gen == 4 && !intel->is_g4x) { 2664 /* gen4 errata: destination from a send can't be used as a 2665 * destination until it's been read. Just read it so we don't 2666 * have to worry. 2667 */ 2668 brw_MOV(p, brw_null_reg(), dst); 2669 } 2670} 2671 2672 2673void 2674fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2675{ 2676 assert(inst->mlen != 0); 2677 2678 /* Clear any post destination dependencies that would be ignored by 2679 * the block read. See the B-Spec for pre-gen5 send instruction. 2680 * 2681 * This could use a better solution, since texture sampling and 2682 * math reads could potentially run into it as well -- anywhere 2683 * that we have a SEND with a destination that is a register that 2684 * was written but not read within the last N instructions (what's 2685 * N? unsure). This is rare because of dead code elimination, but 2686 * not impossible. 2687 */ 2688 if (intel->gen == 4 && !intel->is_g4x) 2689 brw_MOV(p, brw_null_reg(), dst); 2690 2691 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2692 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2693 2694 if (intel->gen == 4 && !intel->is_g4x) { 2695 /* gen4 errata: destination from a send can't be used as a 2696 * destination until it's been read. 
Just read it so we don't 2697 * have to worry. 2698 */ 2699 brw_MOV(p, brw_null_reg(), dst); 2700 } 2701} 2702 2703/** 2704 * To be called after the last _mesa_add_state_reference() call, to 2705 * set up prog_data.param[] for assign_curb_setup() and 2706 * setup_pull_constants(). 2707 */ 2708void 2709fs_visitor::setup_paramvalues_refs() 2710{ 2711 /* Set up the pointers to ParamValues now that that array is finalized. */ 2712 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2713 c->prog_data.param[i] = 2714 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2715 this->param_offset[i]; 2716 } 2717} 2718 2719void 2720fs_visitor::assign_curb_setup() 2721{ 2722 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2723 if (c->dispatch_width == 8) { 2724 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2725 } else { 2726 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 2727 } 2728 2729 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2730 foreach_iter(exec_list_iterator, iter, this->instructions) { 2731 fs_inst *inst = (fs_inst *)iter.get(); 2732 2733 for (unsigned int i = 0; i < 3; i++) { 2734 if (inst->src[i].file == UNIFORM) { 2735 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2736 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 2737 constant_nr / 8, 2738 constant_nr % 8); 2739 2740 inst->src[i].file = FIXED_HW_REG; 2741 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2742 } 2743 } 2744 } 2745} 2746 2747void 2748fs_visitor::calculate_urb_setup() 2749{ 2750 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2751 urb_setup[i] = -1; 2752 } 2753 2754 int urb_next = 0; 2755 /* Figure out where each of the incoming setup attributes lands. 
*/ 2756 if (intel->gen >= 6) { 2757 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2758 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2759 urb_setup[i] = urb_next++; 2760 } 2761 } 2762 } else { 2763 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 2764 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2765 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2766 int fp_index; 2767 2768 if (i >= VERT_RESULT_VAR0) 2769 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2770 else if (i <= VERT_RESULT_TEX7) 2771 fp_index = i; 2772 else 2773 fp_index = -1; 2774 2775 if (fp_index >= 0) 2776 urb_setup[fp_index] = urb_next++; 2777 } 2778 } 2779 } 2780 2781 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2782 c->prog_data.urb_read_length = urb_next * 2; 2783} 2784 2785void 2786fs_visitor::assign_urb_setup() 2787{ 2788 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 2789 2790 /* Offset all the urb_setup[] index by the actual position of the 2791 * setup regs, now that the location of the constants has been chosen. 2792 */ 2793 foreach_iter(exec_list_iterator, iter, this->instructions) { 2794 fs_inst *inst = (fs_inst *)iter.get(); 2795 2796 if (inst->opcode == FS_OPCODE_LINTERP) { 2797 assert(inst->src[2].file == FIXED_HW_REG); 2798 inst->src[2].fixed_hw_reg.nr += urb_start; 2799 } 2800 2801 if (inst->opcode == FS_OPCODE_CINTERP) { 2802 assert(inst->src[0].file == FIXED_HW_REG); 2803 inst->src[0].fixed_hw_reg.nr += urb_start; 2804 } 2805 } 2806 2807 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2808} 2809 2810/** 2811 * Split large virtual GRFs into separate components if we can. 
2812 * 2813 * This is mostly duplicated with what brw_fs_vector_splitting does, 2814 * but that's really conservative because it's afraid of doing 2815 * splitting that doesn't result in real progress after the rest of 2816 * the optimization phases, which would cause infinite looping in 2817 * optimization. We can do it once here, safely. This also has the 2818 * opportunity to split interpolated values, or maybe even uniforms, 2819 * which we don't have at the IR level. 2820 * 2821 * We want to split, because virtual GRFs are what we register 2822 * allocate and spill (due to contiguousness requirements for some 2823 * instructions), and they're what we naturally generate in the 2824 * codegen process, but most virtual GRFs don't actually need to be 2825 * contiguous sets of GRFs. If we split, we'll end up with reduced 2826 * live intervals and better dead code elimination and coalescing. 2827 */ 2828void 2829fs_visitor::split_virtual_grfs() 2830{ 2831 int num_vars = this->virtual_grf_next; 2832 bool split_grf[num_vars]; 2833 int new_virtual_grf[num_vars]; 2834 2835 /* Try to split anything > 0 sized. */ 2836 for (int i = 0; i < num_vars; i++) { 2837 if (this->virtual_grf_sizes[i] != 1) 2838 split_grf[i] = true; 2839 else 2840 split_grf[i] = false; 2841 } 2842 2843 if (brw->has_pln) { 2844 /* PLN opcodes rely on the delta_xy being contiguous. */ 2845 split_grf[this->delta_x.reg] = false; 2846 } 2847 2848 foreach_iter(exec_list_iterator, iter, this->instructions) { 2849 fs_inst *inst = (fs_inst *)iter.get(); 2850 2851 /* Texturing produces 4 contiguous registers, so no splitting. */ 2852 if (inst->is_tex()) { 2853 split_grf[inst->dst.reg] = false; 2854 } 2855 } 2856 2857 /* Allocate new space for split regs. Note that the virtual 2858 * numbers will be contiguous. 
2859 */ 2860 for (int i = 0; i < num_vars; i++) { 2861 if (split_grf[i]) { 2862 new_virtual_grf[i] = virtual_grf_alloc(1); 2863 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 2864 int reg = virtual_grf_alloc(1); 2865 assert(reg == new_virtual_grf[i] + j - 1); 2866 (void) reg; 2867 } 2868 this->virtual_grf_sizes[i] = 1; 2869 } 2870 } 2871 2872 foreach_iter(exec_list_iterator, iter, this->instructions) { 2873 fs_inst *inst = (fs_inst *)iter.get(); 2874 2875 if (inst->dst.file == GRF && 2876 split_grf[inst->dst.reg] && 2877 inst->dst.reg_offset != 0) { 2878 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 2879 inst->dst.reg_offset - 1); 2880 inst->dst.reg_offset = 0; 2881 } 2882 for (int i = 0; i < 3; i++) { 2883 if (inst->src[i].file == GRF && 2884 split_grf[inst->src[i].reg] && 2885 inst->src[i].reg_offset != 0) { 2886 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 2887 inst->src[i].reg_offset - 1); 2888 inst->src[i].reg_offset = 0; 2889 } 2890 } 2891 } 2892 this->live_intervals_valid = false; 2893} 2894 2895/** 2896 * Choose accesses from the UNIFORM file to demote to using the pull 2897 * constant buffer. 2898 * 2899 * We allow a fragment shader to have more than the specified minimum 2900 * maximum number of fragment shader uniform components (64). If 2901 * there are too many of these, they'd fill up all of register space. 2902 * So, this will push some of them out to the pull constant buffer and 2903 * update the program to load them. 2904 */ 2905void 2906fs_visitor::setup_pull_constants() 2907{ 2908 /* Only allow 16 registers (128 uniform components) as push constants. */ 2909 unsigned int max_uniform_components = 16 * 8; 2910 if (c->prog_data.nr_params <= max_uniform_components) 2911 return; 2912 2913 /* Just demote the end of the list. We could probably do better 2914 * here, demoting things that are rarely used in the program first. 
2915 */ 2916 int pull_uniform_base = max_uniform_components; 2917 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 2918 2919 foreach_iter(exec_list_iterator, iter, this->instructions) { 2920 fs_inst *inst = (fs_inst *)iter.get(); 2921 2922 for (int i = 0; i < 3; i++) { 2923 if (inst->src[i].file != UNIFORM) 2924 continue; 2925 2926 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2927 if (uniform_nr < pull_uniform_base) 2928 continue; 2929 2930 fs_reg dst = fs_reg(this, glsl_type::float_type); 2931 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 2932 dst); 2933 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 2934 pull->ir = inst->ir; 2935 pull->annotation = inst->annotation; 2936 pull->base_mrf = 14; 2937 pull->mlen = 1; 2938 2939 inst->insert_before(pull); 2940 2941 inst->src[i].file = GRF; 2942 inst->src[i].reg = dst.reg; 2943 inst->src[i].reg_offset = 0; 2944 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 2945 } 2946 } 2947 2948 for (int i = 0; i < pull_uniform_count; i++) { 2949 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 2950 c->prog_data.pull_param_convert[i] = 2951 c->prog_data.param_convert[pull_uniform_base + i]; 2952 } 2953 c->prog_data.nr_params -= pull_uniform_count; 2954 c->prog_data.nr_pull_params = pull_uniform_count; 2955} 2956 2957void 2958fs_visitor::calculate_live_intervals() 2959{ 2960 int num_vars = this->virtual_grf_next; 2961 int *def = ralloc_array(mem_ctx, int, num_vars); 2962 int *use = ralloc_array(mem_ctx, int, num_vars); 2963 int loop_depth = 0; 2964 int loop_start = 0; 2965 int bb_header_ip = 0; 2966 2967 if (this->live_intervals_valid) 2968 return; 2969 2970 for (int i = 0; i < num_vars; i++) { 2971 def[i] = MAX_INSTRUCTION; 2972 use[i] = -1; 2973 } 2974 2975 int ip = 0; 2976 foreach_iter(exec_list_iterator, iter, this->instructions) { 2977 fs_inst *inst = (fs_inst *)iter.get(); 2978 2979 if (inst->opcode == BRW_OPCODE_DO) { 2980 
if (loop_depth++ == 0) 2981 loop_start = ip; 2982 } else if (inst->opcode == BRW_OPCODE_WHILE) { 2983 loop_depth--; 2984 2985 if (loop_depth == 0) { 2986 /* Patches up the use of vars marked for being live across 2987 * the whole loop. 2988 */ 2989 for (int i = 0; i < num_vars; i++) { 2990 if (use[i] == loop_start) { 2991 use[i] = ip; 2992 } 2993 } 2994 } 2995 } else { 2996 for (unsigned int i = 0; i < 3; i++) { 2997 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 2998 int reg = inst->src[i].reg; 2999 3000 if (!loop_depth) { 3001 use[reg] = ip; 3002 } else { 3003 def[reg] = MIN2(loop_start, def[reg]); 3004 use[reg] = loop_start; 3005 3006 /* Nobody else is going to go smash our start to 3007 * later in the loop now, because def[reg] now 3008 * points before the bb header. 3009 */ 3010 } 3011 } 3012 } 3013 if (inst->dst.file == GRF && inst->dst.reg != 0) { 3014 int reg = inst->dst.reg; 3015 3016 if (!loop_depth) { 3017 def[reg] = MIN2(def[reg], ip); 3018 } else { 3019 def[reg] = MIN2(def[reg], loop_start); 3020 } 3021 } 3022 } 3023 3024 ip++; 3025 3026 /* Set the basic block header IP. This is used for determining 3027 * if a complete def of single-register virtual GRF in a loop 3028 * dominates a use in the same basic block. It's a quick way to 3029 * reduce the live interval range of most register used in a 3030 * loop. 
3031 */ 3032 if (inst->opcode == BRW_OPCODE_IF || 3033 inst->opcode == BRW_OPCODE_ELSE || 3034 inst->opcode == BRW_OPCODE_ENDIF || 3035 inst->opcode == BRW_OPCODE_DO || 3036 inst->opcode == BRW_OPCODE_WHILE || 3037 inst->opcode == BRW_OPCODE_BREAK || 3038 inst->opcode == BRW_OPCODE_CONTINUE) { 3039 bb_header_ip = ip; 3040 } 3041 } 3042 3043 ralloc_free(this->virtual_grf_def); 3044 ralloc_free(this->virtual_grf_use); 3045 this->virtual_grf_def = def; 3046 this->virtual_grf_use = use; 3047 3048 this->live_intervals_valid = true; 3049} 3050 3051/** 3052 * Attempts to move immediate constants into the immediate 3053 * constant slot of following instructions. 3054 * 3055 * Immediate constants are a bit tricky -- they have to be in the last 3056 * operand slot, you can't do abs/negate on them, 3057 */ 3058 3059bool 3060fs_visitor::propagate_constants() 3061{ 3062 bool progress = false; 3063 3064 /* Need to update the MRF tracking for compressed instructions. */ 3065 if (c->dispatch_width == 16) 3066 return false; 3067 3068 calculate_live_intervals(); 3069 3070 foreach_iter(exec_list_iterator, iter, this->instructions) { 3071 fs_inst *inst = (fs_inst *)iter.get(); 3072 3073 if (inst->opcode != BRW_OPCODE_MOV || 3074 inst->predicated || 3075 inst->dst.file != GRF || inst->src[0].file != IMM || 3076 inst->dst.type != inst->src[0].type) 3077 continue; 3078 3079 /* Don't bother with cases where we should have had the 3080 * operation on the constant folded in GLSL already. 3081 */ 3082 if (inst->saturate) 3083 continue; 3084 3085 /* Found a move of a constant to a GRF. Find anything else using the GRF 3086 * before it's written, and replace it with the constant if we can. 
3087 */ 3088 exec_list_iterator scan_iter = iter; 3089 scan_iter.next(); 3090 for (; scan_iter.has_next(); scan_iter.next()) { 3091 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3092 3093 if (scan_inst->opcode == BRW_OPCODE_DO || 3094 scan_inst->opcode == BRW_OPCODE_WHILE || 3095 scan_inst->opcode == BRW_OPCODE_ELSE || 3096 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3097 break; 3098 } 3099 3100 for (int i = 2; i >= 0; i--) { 3101 if (scan_inst->src[i].file != GRF || 3102 scan_inst->src[i].reg != inst->dst.reg || 3103 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 3104 continue; 3105 3106 /* Don't bother with cases where we should have had the 3107 * operation on the constant folded in GLSL already. 3108 */ 3109 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 3110 continue; 3111 3112 switch (scan_inst->opcode) { 3113 case BRW_OPCODE_MOV: 3114 scan_inst->src[i] = inst->src[0]; 3115 progress = true; 3116 break; 3117 3118 case BRW_OPCODE_MUL: 3119 case BRW_OPCODE_ADD: 3120 if (i == 1) { 3121 scan_inst->src[i] = inst->src[0]; 3122 progress = true; 3123 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3124 /* Fit this constant in by commuting the operands */ 3125 scan_inst->src[0] = scan_inst->src[1]; 3126 scan_inst->src[1] = inst->src[0]; 3127 progress = true; 3128 } 3129 break; 3130 3131 case BRW_OPCODE_CMP: 3132 if (i == 1) { 3133 scan_inst->src[i] = inst->src[0]; 3134 progress = true; 3135 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3136 uint32_t new_cmod; 3137 3138 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 3139 if (new_cmod != ~0u) { 3140 /* Fit this constant in by swapping the operands and 3141 * flipping the test 3142 */ 3143 scan_inst->src[0] = scan_inst->src[1]; 3144 scan_inst->src[1] = inst->src[0]; 3145 scan_inst->conditional_mod = new_cmod; 3146 progress = true; 3147 } 3148 } 3149 break; 3150 3151 case BRW_OPCODE_SEL: 3152 if (i == 1) { 3153 scan_inst->src[i] = inst->src[0]; 3154 progress = true; 3155 } else if (i == 
0 && scan_inst->src[1].file != IMM) { 3156 /* Fit this constant in by swapping the operands and 3157 * flipping the predicate 3158 */ 3159 scan_inst->src[0] = scan_inst->src[1]; 3160 scan_inst->src[1] = inst->src[0]; 3161 scan_inst->predicate_inverse = !scan_inst->predicate_inverse; 3162 progress = true; 3163 } 3164 break; 3165 } 3166 } 3167 3168 if (scan_inst->dst.file == GRF && 3169 scan_inst->dst.reg == inst->dst.reg && 3170 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3171 scan_inst->is_tex())) { 3172 break; 3173 } 3174 } 3175 } 3176 3177 if (progress) 3178 this->live_intervals_valid = false; 3179 3180 return progress; 3181} 3182/** 3183 * Must be called after calculate_live_intervales() to remove unused 3184 * writes to registers -- register allocation will fail otherwise 3185 * because something deffed but not used won't be considered to 3186 * interfere with other regs. 3187 */ 3188bool 3189fs_visitor::dead_code_eliminate() 3190{ 3191 bool progress = false; 3192 int pc = 0; 3193 3194 calculate_live_intervals(); 3195 3196 foreach_iter(exec_list_iterator, iter, this->instructions) { 3197 fs_inst *inst = (fs_inst *)iter.get(); 3198 3199 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 3200 inst->remove(); 3201 progress = true; 3202 } 3203 3204 pc++; 3205 } 3206 3207 if (progress) 3208 live_intervals_valid = false; 3209 3210 return progress; 3211} 3212 3213bool 3214fs_visitor::register_coalesce() 3215{ 3216 bool progress = false; 3217 int if_depth = 0; 3218 int loop_depth = 0; 3219 3220 foreach_iter(exec_list_iterator, iter, this->instructions) { 3221 fs_inst *inst = (fs_inst *)iter.get(); 3222 3223 /* Make sure that we dominate the instructions we're going to 3224 * scan for interfering with our coalescing, or we won't have 3225 * scanned enough to see if anything interferes with our 3226 * coalescing. We don't dominate the following instructions if 3227 * we're in a loop or an if block. 
3228 */ 3229 switch (inst->opcode) { 3230 case BRW_OPCODE_DO: 3231 loop_depth++; 3232 break; 3233 case BRW_OPCODE_WHILE: 3234 loop_depth--; 3235 break; 3236 case BRW_OPCODE_IF: 3237 if_depth++; 3238 break; 3239 case BRW_OPCODE_ENDIF: 3240 if_depth--; 3241 break; 3242 } 3243 if (loop_depth || if_depth) 3244 continue; 3245 3246 if (inst->opcode != BRW_OPCODE_MOV || 3247 inst->predicated || 3248 inst->saturate || 3249 inst->dst.file != GRF || inst->src[0].file != GRF || 3250 inst->dst.type != inst->src[0].type) 3251 continue; 3252 3253 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 3254 3255 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3256 * them: check for no writes to either one until the exit of the 3257 * program. 3258 */ 3259 bool interfered = false; 3260 exec_list_iterator scan_iter = iter; 3261 scan_iter.next(); 3262 for (; scan_iter.has_next(); scan_iter.next()) { 3263 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3264 3265 if (scan_inst->dst.file == GRF) { 3266 if (scan_inst->dst.reg == inst->dst.reg && 3267 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3268 scan_inst->is_tex())) { 3269 interfered = true; 3270 break; 3271 } 3272 if (scan_inst->dst.reg == inst->src[0].reg && 3273 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3274 scan_inst->is_tex())) { 3275 interfered = true; 3276 break; 3277 } 3278 } 3279 3280 /* The gen6 MATH instruction can't handle source modifiers, so avoid 3281 * coalescing those for now. We should do something more specific. 3282 */ 3283 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) { 3284 interfered = true; 3285 break; 3286 } 3287 } 3288 if (interfered) { 3289 continue; 3290 } 3291 3292 /* Rewrite the later usage to point at the source of the move to 3293 * be removed. 
3294 */ 3295 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3296 scan_iter.next()) { 3297 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3298 3299 for (int i = 0; i < 3; i++) { 3300 if (scan_inst->src[i].file == GRF && 3301 scan_inst->src[i].reg == inst->dst.reg && 3302 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3303 scan_inst->src[i].reg = inst->src[0].reg; 3304 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3305 scan_inst->src[i].abs |= inst->src[0].abs; 3306 scan_inst->src[i].negate ^= inst->src[0].negate; 3307 scan_inst->src[i].smear = inst->src[0].smear; 3308 } 3309 } 3310 } 3311 3312 inst->remove(); 3313 progress = true; 3314 } 3315 3316 if (progress) 3317 live_intervals_valid = false; 3318 3319 return progress; 3320} 3321 3322 3323bool 3324fs_visitor::compute_to_mrf() 3325{ 3326 bool progress = false; 3327 int next_ip = 0; 3328 3329 /* Need to update the MRF tracking for compressed instructions. */ 3330 if (c->dispatch_width == 16) 3331 return false; 3332 3333 calculate_live_intervals(); 3334 3335 foreach_iter(exec_list_iterator, iter, this->instructions) { 3336 fs_inst *inst = (fs_inst *)iter.get(); 3337 3338 int ip = next_ip; 3339 next_ip++; 3340 3341 if (inst->opcode != BRW_OPCODE_MOV || 3342 inst->predicated || 3343 inst->dst.file != MRF || inst->src[0].file != GRF || 3344 inst->dst.type != inst->src[0].type || 3345 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3346 continue; 3347 3348 /* Can't compute-to-MRF this GRF if someone else was going to 3349 * read it later. 3350 */ 3351 if (this->virtual_grf_use[inst->src[0].reg] > ip) 3352 continue; 3353 3354 /* Found a move of a GRF to a MRF. Let's see if we can go 3355 * rewrite the thing that made this GRF to write into the MRF. 
3356 */ 3357 fs_inst *scan_inst; 3358 for (scan_inst = (fs_inst *)inst->prev; 3359 scan_inst->prev != NULL; 3360 scan_inst = (fs_inst *)scan_inst->prev) { 3361 if (scan_inst->dst.file == GRF && 3362 scan_inst->dst.reg == inst->src[0].reg) { 3363 /* Found the last thing to write our reg we want to turn 3364 * into a compute-to-MRF. 3365 */ 3366 3367 if (scan_inst->is_tex()) { 3368 /* texturing writes several continuous regs, so we can't 3369 * compute-to-mrf that. 3370 */ 3371 break; 3372 } 3373 3374 /* If it's predicated, it (probably) didn't populate all 3375 * the channels. 3376 */ 3377 if (scan_inst->predicated) 3378 break; 3379 3380 /* SEND instructions can't have MRF as a destination. */ 3381 if (scan_inst->mlen) 3382 break; 3383 3384 if (intel->gen >= 6) { 3385 /* gen6 math instructions must have the destination be 3386 * GRF, so no compute-to-MRF for them. 3387 */ 3388 if (scan_inst->is_math()) { 3389 break; 3390 } 3391 } 3392 3393 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 3394 /* Found the creator of our MRF's source value. */ 3395 scan_inst->dst.file = MRF; 3396 scan_inst->dst.hw_reg = inst->dst.hw_reg; 3397 scan_inst->saturate |= inst->saturate; 3398 inst->remove(); 3399 progress = true; 3400 } 3401 break; 3402 } 3403 3404 /* We don't handle flow control here. Most computation of 3405 * values that end up in MRFs are shortly before the MRF 3406 * write anyway. 3407 */ 3408 if (scan_inst->opcode == BRW_OPCODE_DO || 3409 scan_inst->opcode == BRW_OPCODE_WHILE || 3410 scan_inst->opcode == BRW_OPCODE_ELSE || 3411 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3412 break; 3413 } 3414 3415 /* You can't read from an MRF, so if someone else reads our 3416 * MRF's source GRF that we wanted to rewrite, that stops us. 
3417 */ 3418 bool interfered = false; 3419 for (int i = 0; i < 3; i++) { 3420 if (scan_inst->src[i].file == GRF && 3421 scan_inst->src[i].reg == inst->src[0].reg && 3422 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 3423 interfered = true; 3424 } 3425 } 3426 if (interfered) 3427 break; 3428 3429 if (scan_inst->dst.file == MRF && 3430 scan_inst->dst.hw_reg == inst->dst.hw_reg) { 3431 /* Somebody else wrote our MRF here, so we can't can't 3432 * compute-to-MRF before that. 3433 */ 3434 break; 3435 } 3436 3437 if (scan_inst->mlen > 0) { 3438 /* Found a SEND instruction, which means that there are 3439 * live values in MRFs from base_mrf to base_mrf + 3440 * scan_inst->mlen - 1. Don't go pushing our MRF write up 3441 * above it. 3442 */ 3443 if (inst->dst.hw_reg >= scan_inst->base_mrf && 3444 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) { 3445 break; 3446 } 3447 } 3448 } 3449 } 3450 3451 return progress; 3452} 3453 3454/** 3455 * Walks through basic blocks, locking for repeated MRF writes and 3456 * removing the later ones. 3457 */ 3458bool 3459fs_visitor::remove_duplicate_mrf_writes() 3460{ 3461 fs_inst *last_mrf_move[16]; 3462 bool progress = false; 3463 3464 /* Need to update the MRF tracking for compressed instructions. 
*/ 3465 if (c->dispatch_width == 16) 3466 return false; 3467 3468 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3469 3470 foreach_iter(exec_list_iterator, iter, this->instructions) { 3471 fs_inst *inst = (fs_inst *)iter.get(); 3472 3473 switch (inst->opcode) { 3474 case BRW_OPCODE_DO: 3475 case BRW_OPCODE_WHILE: 3476 case BRW_OPCODE_IF: 3477 case BRW_OPCODE_ELSE: 3478 case BRW_OPCODE_ENDIF: 3479 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3480 continue; 3481 default: 3482 break; 3483 } 3484 3485 if (inst->opcode == BRW_OPCODE_MOV && 3486 inst->dst.file == MRF) { 3487 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 3488 if (prev_inst && inst->equals(prev_inst)) { 3489 inst->remove(); 3490 progress = true; 3491 continue; 3492 } 3493 } 3494 3495 /* Clear out the last-write records for MRFs that were overwritten. */ 3496 if (inst->dst.file == MRF) { 3497 last_mrf_move[inst->dst.hw_reg] = NULL; 3498 } 3499 3500 if (inst->mlen > 0) { 3501 /* Found a SEND instruction, which will include two or fewer 3502 * implied MRF writes. We could do better here. 3503 */ 3504 for (int i = 0; i < implied_mrf_writes(inst); i++) { 3505 last_mrf_move[inst->base_mrf + i] = NULL; 3506 } 3507 } 3508 3509 /* Clear out any MRF move records whose sources got overwritten. 
*/ 3510 if (inst->dst.file == GRF) { 3511 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3512 if (last_mrf_move[i] && 3513 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3514 last_mrf_move[i] = NULL; 3515 } 3516 } 3517 } 3518 3519 if (inst->opcode == BRW_OPCODE_MOV && 3520 inst->dst.file == MRF && 3521 inst->src[0].file == GRF && 3522 !inst->predicated) { 3523 last_mrf_move[inst->dst.hw_reg] = inst; 3524 } 3525 } 3526 3527 return progress; 3528} 3529 3530bool 3531fs_visitor::virtual_grf_interferes(int a, int b) 3532{ 3533 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3534 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3535 3536 /* We can't handle dead register writes here, without iterating 3537 * over the whole instruction stream to find every single dead 3538 * write to that register to compare to the live interval of the 3539 * other register. Just assert that dead_code_eliminate() has been 3540 * called. 3541 */ 3542 assert((this->virtual_grf_use[a] != -1 || 3543 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3544 (this->virtual_grf_use[b] != -1 || 3545 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3546 3547 /* If the register is used to store 16 values of less than float 3548 * size (only the case for pixel_[xy]), then we can't allocate 3549 * another dword-sized thing to that register that would be used in 3550 * the same instruction. This is because when the GPU decodes (for 3551 * example): 3552 * 3553 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 3554 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 3555 * 3556 * it's actually processed as: 3557 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 3558 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 3559 * 3560 * so our second half values in g6 got overwritten in the first 3561 * half. 
3562 */ 3563 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 3564 this->pixel_x.reg == b || 3565 this->pixel_y.reg == a || 3566 this->pixel_y.reg == b)) { 3567 return start <= end; 3568 } 3569 3570 return start < end; 3571} 3572 3573static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3574{ 3575 struct brw_reg brw_reg; 3576 3577 switch (reg->file) { 3578 case GRF: 3579 case ARF: 3580 case MRF: 3581 if (reg->smear == -1) { 3582 brw_reg = brw_vec8_reg(reg->file, 3583 reg->hw_reg, 0); 3584 } else { 3585 brw_reg = brw_vec1_reg(reg->file, 3586 reg->hw_reg, reg->smear); 3587 } 3588 brw_reg = retype(brw_reg, reg->type); 3589 if (reg->sechalf) 3590 brw_reg = sechalf(brw_reg); 3591 break; 3592 case IMM: 3593 switch (reg->type) { 3594 case BRW_REGISTER_TYPE_F: 3595 brw_reg = brw_imm_f(reg->imm.f); 3596 break; 3597 case BRW_REGISTER_TYPE_D: 3598 brw_reg = brw_imm_d(reg->imm.i); 3599 break; 3600 case BRW_REGISTER_TYPE_UD: 3601 brw_reg = brw_imm_ud(reg->imm.u); 3602 break; 3603 default: 3604 assert(!"not reached"); 3605 brw_reg = brw_null_reg(); 3606 break; 3607 } 3608 break; 3609 case FIXED_HW_REG: 3610 brw_reg = reg->fixed_hw_reg; 3611 break; 3612 case BAD_FILE: 3613 /* Probably unused. 
*/ 3614 brw_reg = brw_null_reg(); 3615 break; 3616 case UNIFORM: 3617 assert(!"not reached"); 3618 brw_reg = brw_null_reg(); 3619 break; 3620 default: 3621 assert(!"not reached"); 3622 brw_reg = brw_null_reg(); 3623 break; 3624 } 3625 if (reg->abs) 3626 brw_reg = brw_abs(brw_reg); 3627 if (reg->negate) 3628 brw_reg = negate(brw_reg); 3629 3630 return brw_reg; 3631} 3632 3633void 3634fs_visitor::generate_code() 3635{ 3636 int last_native_inst = p->nr_insn; 3637 const char *last_annotation_string = NULL; 3638 ir_instruction *last_annotation_ir = NULL; 3639 3640 int if_stack_array_size = 16; 3641 int loop_stack_array_size = 16; 3642 int if_stack_depth = 0, loop_stack_depth = 0; 3643 brw_instruction **if_stack = 3644 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size); 3645 brw_instruction **loop_stack = 3646 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); 3647 int *if_depth_in_loop = 3648 rzalloc_array(this->mem_ctx, int, loop_stack_array_size); 3649 3650 3651 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3652 printf("Native code for fragment shader %d (%d-wide dispatch):\n", 3653 ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width); 3654 } 3655 3656 foreach_iter(exec_list_iterator, iter, this->instructions) { 3657 fs_inst *inst = (fs_inst *)iter.get(); 3658 struct brw_reg src[3], dst; 3659 3660 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3661 if (last_annotation_ir != inst->ir) { 3662 last_annotation_ir = inst->ir; 3663 if (last_annotation_ir) { 3664 printf(" "); 3665 last_annotation_ir->print(); 3666 printf("\n"); 3667 } 3668 } 3669 if (last_annotation_string != inst->annotation) { 3670 last_annotation_string = inst->annotation; 3671 if (last_annotation_string) 3672 printf(" %s\n", last_annotation_string); 3673 } 3674 } 3675 3676 for (unsigned int i = 0; i < 3; i++) { 3677 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3678 } 3679 dst = brw_reg_from_fs_reg(&inst->dst); 3680 3681 brw_set_conditionalmod(p, inst->conditional_mod); 
3682 brw_set_predicate_control(p, inst->predicated); 3683 brw_set_predicate_inverse(p, inst->predicate_inverse); 3684 brw_set_saturate(p, inst->saturate); 3685 3686 if (inst->force_uncompressed || c->dispatch_width == 8) { 3687 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 3688 } else if (inst->force_sechalf) { 3689 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 3690 } else { 3691 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 3692 } 3693 3694 switch (inst->opcode) { 3695 case BRW_OPCODE_MOV: 3696 brw_MOV(p, dst, src[0]); 3697 break; 3698 case BRW_OPCODE_ADD: 3699 brw_ADD(p, dst, src[0], src[1]); 3700 break; 3701 case BRW_OPCODE_MUL: 3702 brw_MUL(p, dst, src[0], src[1]); 3703 break; 3704 3705 case BRW_OPCODE_FRC: 3706 brw_FRC(p, dst, src[0]); 3707 break; 3708 case BRW_OPCODE_RNDD: 3709 brw_RNDD(p, dst, src[0]); 3710 break; 3711 case BRW_OPCODE_RNDE: 3712 brw_RNDE(p, dst, src[0]); 3713 break; 3714 case BRW_OPCODE_RNDZ: 3715 brw_RNDZ(p, dst, src[0]); 3716 break; 3717 3718 case BRW_OPCODE_AND: 3719 brw_AND(p, dst, src[0], src[1]); 3720 break; 3721 case BRW_OPCODE_OR: 3722 brw_OR(p, dst, src[0], src[1]); 3723 break; 3724 case BRW_OPCODE_XOR: 3725 brw_XOR(p, dst, src[0], src[1]); 3726 break; 3727 case BRW_OPCODE_NOT: 3728 brw_NOT(p, dst, src[0]); 3729 break; 3730 case BRW_OPCODE_ASR: 3731 brw_ASR(p, dst, src[0], src[1]); 3732 break; 3733 case BRW_OPCODE_SHR: 3734 brw_SHR(p, dst, src[0], src[1]); 3735 break; 3736 case BRW_OPCODE_SHL: 3737 brw_SHL(p, dst, src[0], src[1]); 3738 break; 3739 3740 case BRW_OPCODE_CMP: 3741 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3742 break; 3743 case BRW_OPCODE_SEL: 3744 brw_SEL(p, dst, src[0], src[1]); 3745 break; 3746 3747 case BRW_OPCODE_IF: 3748 if (inst->src[0].file != BAD_FILE) { 3749 assert(intel->gen >= 6); 3750 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]); 3751 } else { 3752 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8); 3753 } 3754 
if_depth_in_loop[loop_stack_depth]++; 3755 if_stack_depth++; 3756 if (if_stack_array_size <= if_stack_depth) { 3757 if_stack_array_size *= 2; 3758 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *, 3759 if_stack_array_size); 3760 } 3761 break; 3762 3763 case BRW_OPCODE_ELSE: 3764 if_stack[if_stack_depth - 1] = 3765 brw_ELSE(p, if_stack[if_stack_depth - 1]); 3766 break; 3767 case BRW_OPCODE_ENDIF: 3768 if_stack_depth--; 3769 brw_ENDIF(p , if_stack[if_stack_depth]); 3770 if_depth_in_loop[loop_stack_depth]--; 3771 break; 3772 3773 case BRW_OPCODE_DO: 3774 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3775 if (loop_stack_array_size <= loop_stack_depth) { 3776 loop_stack_array_size *= 2; 3777 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, 3778 loop_stack_array_size); 3779 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, 3780 loop_stack_array_size); 3781 } 3782 if_depth_in_loop[loop_stack_depth] = 0; 3783 break; 3784 3785 case BRW_OPCODE_BREAK: 3786 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3787 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3788 break; 3789 case BRW_OPCODE_CONTINUE: 3790 /* FINISHME: We need to write the loop instruction support still. 
*/ 3791 if (intel->gen >= 6) 3792 gen6_CONT(p, loop_stack[loop_stack_depth - 1]); 3793 else 3794 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3795 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3796 break; 3797 3798 case BRW_OPCODE_WHILE: { 3799 struct brw_instruction *inst0, *inst1; 3800 GLuint br = 1; 3801 3802 if (intel->gen >= 5) 3803 br = 2; 3804 3805 assert(loop_stack_depth > 0); 3806 loop_stack_depth--; 3807 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3808 if (intel->gen < 6) { 3809 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3810 while (inst0 > loop_stack[loop_stack_depth]) { 3811 inst0--; 3812 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3813 inst0->bits3.if_else.jump_count == 0) { 3814 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3815 } 3816 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3817 inst0->bits3.if_else.jump_count == 0) { 3818 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3819 } 3820 } 3821 } 3822 } 3823 break; 3824 3825 case FS_OPCODE_RCP: 3826 case FS_OPCODE_RSQ: 3827 case FS_OPCODE_SQRT: 3828 case FS_OPCODE_EXP2: 3829 case FS_OPCODE_LOG2: 3830 case FS_OPCODE_POW: 3831 case FS_OPCODE_SIN: 3832 case FS_OPCODE_COS: 3833 generate_math(inst, dst, src); 3834 break; 3835 case FS_OPCODE_PIXEL_X: 3836 generate_pixel_xy(dst, true); 3837 break; 3838 case FS_OPCODE_PIXEL_Y: 3839 generate_pixel_xy(dst, false); 3840 break; 3841 case FS_OPCODE_CINTERP: 3842 brw_MOV(p, dst, src[0]); 3843 break; 3844 case FS_OPCODE_LINTERP: 3845 generate_linterp(inst, dst, src); 3846 break; 3847 case FS_OPCODE_TEX: 3848 case FS_OPCODE_TXB: 3849 case FS_OPCODE_TXD: 3850 case FS_OPCODE_TXL: 3851 generate_tex(inst, dst, src[0]); 3852 break; 3853 case FS_OPCODE_DISCARD_NOT: 3854 generate_discard_not(inst, dst); 3855 break; 3856 case FS_OPCODE_DISCARD_AND: 3857 generate_discard_and(inst, src[0]); 3858 break; 3859 case FS_OPCODE_DDX: 3860 generate_ddx(inst, dst, src[0]); 3861 break; 3862 case 
FS_OPCODE_DDY: 3863 generate_ddy(inst, dst, src[0]); 3864 break; 3865 3866 case FS_OPCODE_SPILL: 3867 generate_spill(inst, src[0]); 3868 break; 3869 3870 case FS_OPCODE_UNSPILL: 3871 generate_unspill(inst, dst); 3872 break; 3873 3874 case FS_OPCODE_PULL_CONSTANT_LOAD: 3875 generate_pull_constant_load(inst, dst); 3876 break; 3877 3878 case FS_OPCODE_FB_WRITE: 3879 generate_fb_write(inst); 3880 break; 3881 default: 3882 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { 3883 _mesa_problem(ctx, "Unsupported opcode `%s' in FS", 3884 brw_opcodes[inst->opcode].name); 3885 } else { 3886 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); 3887 } 3888 fail("unsupported opcode in FS\n"); 3889 } 3890 3891 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3892 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) { 3893 if (0) { 3894 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3895 ((uint32_t *)&p->store[i])[3], 3896 ((uint32_t *)&p->store[i])[2], 3897 ((uint32_t *)&p->store[i])[1], 3898 ((uint32_t *)&p->store[i])[0]); 3899 } 3900 brw_disasm(stdout, &p->store[i], intel->gen); 3901 } 3902 } 3903 3904 last_native_inst = p->nr_insn; 3905 } 3906 3907 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3908 printf("\n"); 3909 } 3910 3911 ralloc_free(if_stack); 3912 ralloc_free(loop_stack); 3913 ralloc_free(if_depth_in_loop); 3914 3915 brw_set_uip_jip(p); 3916 3917 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS 3918 * emit issues, it doesn't get the jump distances into the output, 3919 * which is often something we want to debug. So this is here in 3920 * case you're doing that. 
3921 */ 3922 if (0) { 3923 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3924 for (unsigned int i = 0; i < p->nr_insn; i++) { 3925 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 3926 ((uint32_t *)&p->store[i])[3], 3927 ((uint32_t *)&p->store[i])[2], 3928 ((uint32_t *)&p->store[i])[1], 3929 ((uint32_t *)&p->store[i])[0]); 3930 brw_disasm(stdout, &p->store[i], intel->gen); 3931 } 3932 } 3933 } 3934} 3935 3936bool 3937fs_visitor::run() 3938{ 3939 uint32_t prog_offset_16 = 0; 3940 3941 brw_wm_payload_setup(brw, c); 3942 3943 if (c->dispatch_width == 16) { 3944 if (c->prog_data.curb_read_length) { 3945 /* Haven't hooked in support for uniforms through the 16-wide 3946 * version yet. 3947 */ 3948 return false; 3949 } 3950 3951 /* align to 64 byte boundary. */ 3952 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) { 3953 brw_NOP(p); 3954 } 3955 3956 /* Save off the start of this 16-wide program in case we succeed. */ 3957 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction); 3958 3959 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 3960 } 3961 3962 if (0) { 3963 emit_dummy_fs(); 3964 } else { 3965 calculate_urb_setup(); 3966 if (intel->gen < 6) 3967 emit_interpolation_setup_gen4(); 3968 else 3969 emit_interpolation_setup_gen6(); 3970 3971 /* Generate FS IR for main(). (the visitor only descends into 3972 * functions called "main"). 
3973 */ 3974 foreach_iter(exec_list_iterator, iter, *shader->ir) { 3975 ir_instruction *ir = (ir_instruction *)iter.get(); 3976 base_ir = ir; 3977 ir->accept(this); 3978 } 3979 3980 emit_fb_writes(); 3981 3982 split_virtual_grfs(); 3983 3984 setup_paramvalues_refs(); 3985 setup_pull_constants(); 3986 3987 bool progress; 3988 do { 3989 progress = false; 3990 3991 progress = remove_duplicate_mrf_writes() || progress; 3992 3993 progress = propagate_constants() || progress; 3994 progress = register_coalesce() || progress; 3995 progress = compute_to_mrf() || progress; 3996 progress = dead_code_eliminate() || progress; 3997 } while (progress); 3998 3999 schedule_instructions(); 4000 4001 assign_curb_setup(); 4002 assign_urb_setup(); 4003 4004 if (0) { 4005 /* Debug of register spilling: Go spill everything. */ 4006 int virtual_grf_count = virtual_grf_next; 4007 for (int i = 1; i < virtual_grf_count; i++) { 4008 spill_reg(i); 4009 } 4010 } 4011 4012 if (0) 4013 assign_regs_trivial(); 4014 else { 4015 while (!assign_regs()) { 4016 if (failed) 4017 break; 4018 } 4019 } 4020 } 4021 assert(force_uncompressed_stack == 0); 4022 assert(force_sechalf_stack == 0); 4023 4024 if (failed) 4025 return false; 4026 4027 generate_code(); 4028 4029 if (c->dispatch_width == 8) { 4030 c->prog_data.total_grf = grf_used; 4031 } else { 4032 c->prog_data.total_grf_16 = grf_used; 4033 c->prog_data.prog_offset_16 = prog_offset_16; 4034 } 4035 4036 return !failed; 4037} 4038 4039bool 4040brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) 4041{ 4042 struct intel_context *intel = &brw->intel; 4043 struct gl_context *ctx = &intel->ctx; 4044 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram; 4045 4046 if (!prog) 4047 return false; 4048 4049 struct brw_shader *shader = 4050 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 4051 if (!shader) 4052 return false; 4053 4054 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 4055 printf("GLSL IR for native fragment shader 
%d:\n", prog->Name); 4056 _mesa_print_ir(shader->ir, NULL); 4057 printf("\n\n"); 4058 } 4059 4060 /* Now the main event: Visit the shader IR and generate our FS IR for it. 4061 */ 4062 c->dispatch_width = 8; 4063 4064 fs_visitor v(c, shader); 4065 if (!v.run()) { 4066 /* FINISHME: Cleanly fail, test at link time, etc. */ 4067 assert(!"not reached"); 4068 return false; 4069 } 4070 4071 if (intel->gen >= 5) { 4072 c->dispatch_width = 16; 4073 fs_visitor v2(c, shader); 4074 v2.run(); 4075 } 4076 4077 c->prog_data.dispatch_width = 8; 4078 4079 return true; 4080} 4081