brw_fs.cpp revision f147599ef4b0d14c25a7e0d3f9f1c9b0229bb6fc
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28extern "C" { 29 30#include <sys/types.h> 31 32#include "main/macros.h" 33#include "main/shaderobj.h" 34#include "main/uniforms.h" 35#include "program/prog_parameter.h" 36#include "program/prog_print.h" 37#include "program/prog_optimize.h" 38#include "program/register_allocate.h" 39#include "program/sampler.h" 40#include "program/hash_table.h" 41#include "brw_context.h" 42#include "brw_eu.h" 43#include "brw_wm.h" 44} 45#include "brw_fs.h" 46#include "../glsl/glsl_types.h" 47#include "../glsl/ir_optimization.h" 48#include "../glsl/ir_print_visitor.h" 49 50#define MAX_INSTRUCTION (1 << 30) 51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); 52 53struct gl_shader * 54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) 55{ 56 struct brw_shader *shader; 57 58 shader = rzalloc(NULL, struct brw_shader); 59 if (shader) { 60 shader->base.Type = type; 61 shader->base.Name = name; 62 _mesa_init_shader(ctx, &shader->base); 63 } 64 65 return &shader->base; 66} 67 68struct gl_shader_program * 69brw_new_shader_program(struct gl_context *ctx, GLuint name) 70{ 71 struct brw_shader_program *prog; 72 prog = rzalloc(NULL, struct brw_shader_program); 73 if (prog) { 74 prog->base.Name = name; 75 _mesa_init_shader_program(ctx, &prog->base); 76 } 77 return &prog->base; 78} 79 80GLboolean 81brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) 82{ 83 struct brw_context *brw = brw_context(ctx); 84 struct intel_context *intel = &brw->intel; 85 86 struct brw_shader *shader = 87 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 88 if (shader != NULL) { 89 void *mem_ctx = ralloc_context(NULL); 90 bool progress; 91 92 if (shader->ir) 93 ralloc_free(shader->ir); 94 shader->ir = new(shader) exec_list; 95 clone_ir_list(mem_ctx, shader->ir, shader->base.ir); 96 97 do_mat_op_to_vec(shader->ir); 98 lower_instructions(shader->ir, 99 MOD_TO_FRACT | 100 DIV_TO_MUL_RCP | 101 SUB_TO_ADD_NEG | 102 EXP_TO_EXP2 | 103 LOG_TO_LOG2); 104 105 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, 106 * if-statements need to be flattened. 
107 */ 108 if (intel->gen < 6) 109 lower_if_to_cond_assign(shader->ir, 16); 110 111 do_lower_texture_projection(shader->ir); 112 do_vec_index_to_cond_assign(shader->ir); 113 brw_do_cubemap_normalize(shader->ir); 114 lower_noise(shader->ir); 115 lower_quadop_vector(shader->ir, false); 116 lower_variable_index_to_cond_assign(shader->ir, 117 GL_TRUE, /* input */ 118 GL_TRUE, /* output */ 119 GL_TRUE, /* temp */ 120 GL_TRUE /* uniform */ 121 ); 122 123 do { 124 progress = false; 125 126 brw_do_channel_expressions(shader->ir); 127 brw_do_vector_splitting(shader->ir); 128 129 progress = do_lower_jumps(shader->ir, true, true, 130 true, /* main return */ 131 false, /* continue */ 132 false /* loops */ 133 ) || progress; 134 135 progress = do_common_optimization(shader->ir, true, 32) || progress; 136 } while (progress); 137 138 validate_ir_tree(shader->ir); 139 140 reparent_ir(shader->ir, shader->ir); 141 ralloc_free(mem_ctx); 142 } 143 144 if (!_mesa_ir_link_shader(ctx, prog)) 145 return GL_FALSE; 146 147 return GL_TRUE; 148} 149 150static int 151type_size(const struct glsl_type *type) 152{ 153 unsigned int size, i; 154 155 switch (type->base_type) { 156 case GLSL_TYPE_UINT: 157 case GLSL_TYPE_INT: 158 case GLSL_TYPE_FLOAT: 159 case GLSL_TYPE_BOOL: 160 return type->components(); 161 case GLSL_TYPE_ARRAY: 162 return type_size(type->fields.array) * type->length; 163 case GLSL_TYPE_STRUCT: 164 size = 0; 165 for (i = 0; i < type->length; i++) { 166 size += type_size(type->fields.structure[i].type); 167 } 168 return size; 169 case GLSL_TYPE_SAMPLER: 170 /* Samplers take up no register space, since they're baked in at 171 * link time. 172 */ 173 return 0; 174 default: 175 assert(!"not reached"); 176 return 0; 177 } 178} 179 180void 181fs_visitor::fail(const char *format, ...) 182{ 183 if (!failed) { 184 failed = true; 185 186 if (INTEL_DEBUG & DEBUG_WM) { 187 fprintf(stderr, "FS compile failed: "); 188 189 va_list va; 190 va_start(va, format); 191 vfprintf(stderr, format, va); 192 va_end(va); 193 } 194 } 195} 196 197void 198fs_visitor::push_force_uncompressed() 199{ 200 force_uncompressed_stack++; 201} 202 203void 204fs_visitor::pop_force_uncompressed() 205{ 206 force_uncompressed_stack--; 207 assert(force_uncompressed_stack >= 0); 208} 209 210void 211fs_visitor::push_force_sechalf() 212{ 213 force_sechalf_stack++; 214} 215 216void 217fs_visitor::pop_force_sechalf() 218{ 219 force_sechalf_stack--; 220 assert(force_sechalf_stack >= 0); 221} 222 223/** 224 * Returns how many MRFs an FS opcode will write over. 225 * 226 * Note that this is not the 0 or 1 implied writes in an actual gen 227 * instruction -- the FS opcodes often generate MOVs in addition. 
228 */ 229int 230fs_visitor::implied_mrf_writes(fs_inst *inst) 231{ 232 if (inst->mlen == 0) 233 return 0; 234 235 switch (inst->opcode) { 236 case FS_OPCODE_RCP: 237 case FS_OPCODE_RSQ: 238 case FS_OPCODE_SQRT: 239 case FS_OPCODE_EXP2: 240 case FS_OPCODE_LOG2: 241 case FS_OPCODE_SIN: 242 case FS_OPCODE_COS: 243 return 1 * c->dispatch_width / 8; 244 case FS_OPCODE_POW: 245 return 2 * c->dispatch_width / 8; 246 case FS_OPCODE_TEX: 247 case FS_OPCODE_TXB: 248 case FS_OPCODE_TXD: 249 case FS_OPCODE_TXL: 250 return 1; 251 case FS_OPCODE_FB_WRITE: 252 return 2; 253 case FS_OPCODE_PULL_CONSTANT_LOAD: 254 case FS_OPCODE_UNSPILL: 255 return 1; 256 case FS_OPCODE_SPILL: 257 return 2; 258 default: 259 assert(!"not reached"); 260 return inst->mlen; 261 } 262} 263 264int 265fs_visitor::virtual_grf_alloc(int size) 266{ 267 if (virtual_grf_array_size <= virtual_grf_next) { 268 if (virtual_grf_array_size == 0) 269 virtual_grf_array_size = 16; 270 else 271 virtual_grf_array_size *= 2; 272 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 273 virtual_grf_array_size); 274 275 /* This slot is always unused. */ 276 virtual_grf_sizes[0] = 0; 277 } 278 virtual_grf_sizes[virtual_grf_next] = size; 279 return virtual_grf_next++; 280} 281 282/** Fixed HW reg constructor. */ 283fs_reg::fs_reg(enum register_file file, int hw_reg) 284{ 285 init(); 286 this->file = file; 287 this->hw_reg = hw_reg; 288 this->type = BRW_REGISTER_TYPE_F; 289} 290 291/** Fixed HW reg constructor. */ 292fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 293{ 294 init(); 295 this->file = file; 296 this->hw_reg = hw_reg; 297 this->type = type; 298} 299 300int 301brw_type_for_base_type(const struct glsl_type *type) 302{ 303 switch (type->base_type) { 304 case GLSL_TYPE_FLOAT: 305 return BRW_REGISTER_TYPE_F; 306 case GLSL_TYPE_INT: 307 case GLSL_TYPE_BOOL: 308 return BRW_REGISTER_TYPE_D; 309 case GLSL_TYPE_UINT: 310 return BRW_REGISTER_TYPE_UD; 311 case GLSL_TYPE_ARRAY: 312 case GLSL_TYPE_STRUCT: 313 case GLSL_TYPE_SAMPLER: 314 /* These should be overridden with the type of the member when 315 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely 316 * way to trip up if we don't. 317 */ 318 return BRW_REGISTER_TYPE_UD; 319 default: 320 assert(!"not reached"); 321 return BRW_REGISTER_TYPE_F; 322 } 323} 324 325/** Automatic reg constructor. */ 326fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 327{ 328 init(); 329 330 this->file = GRF; 331 this->reg = v->virtual_grf_alloc(type_size(type)); 332 this->reg_offset = 0; 333 this->type = brw_type_for_base_type(type); 334} 335 336fs_reg * 337fs_visitor::variable_storage(ir_variable *var) 338{ 339 return (fs_reg *)hash_table_find(this->variable_ht, var); 340} 341 342void 343import_uniforms_callback(const void *key, 344 void *data, 345 void *closure) 346{ 347 struct hash_table *dst_ht = (struct hash_table *)closure; 348 const fs_reg *reg = (const fs_reg *)data; 349 350 if (reg->file != UNIFORM) 351 return; 352 353 hash_table_insert(dst_ht, data, key); 354} 355 356/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 
357 * This brings in those uniform definitions 358 */ 359void 360fs_visitor::import_uniforms(struct hash_table *src_variable_ht) 361{ 362 hash_table_call_foreach(src_variable_ht, 363 import_uniforms_callback, 364 variable_ht); 365} 366 367/* Our support for uniforms is piggy-backed on the struct 368 * gl_fragment_program, because that's where the values actually 369 * get stored, rather than in some global gl_shader_program uniform 370 * store. 371 */ 372int 373fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 374{ 375 unsigned int offset = 0; 376 377 if (type->is_matrix()) { 378 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 379 type->vector_elements, 380 1); 381 382 for (unsigned int i = 0; i < type->matrix_columns; i++) { 383 offset += setup_uniform_values(loc + offset, column); 384 } 385 386 return offset; 387 } 388 389 switch (type->base_type) { 390 case GLSL_TYPE_FLOAT: 391 case GLSL_TYPE_UINT: 392 case GLSL_TYPE_INT: 393 case GLSL_TYPE_BOOL: 394 for (unsigned int i = 0; i < type->vector_elements; i++) { 395 unsigned int param = c->prog_data.nr_params++; 396 397 assert(param < ARRAY_SIZE(c->prog_data.param)); 398 399 switch (type->base_type) { 400 case GLSL_TYPE_FLOAT: 401 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 402 break; 403 case GLSL_TYPE_UINT: 404 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 405 break; 406 case GLSL_TYPE_INT: 407 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 408 break; 409 case GLSL_TYPE_BOOL: 410 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 411 break; 412 default: 413 assert(!"not reached"); 414 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 415 break; 416 } 417 this->param_index[param] = loc; 418 this->param_offset[param] = i; 419 } 420 return 1; 421 422 case GLSL_TYPE_STRUCT: 423 for (unsigned int i = 0; i < type->length; i++) { 424 offset += setup_uniform_values(loc + offset, 425 type->fields.structure[i].type); 426 } 427 return offset; 428 429 case GLSL_TYPE_ARRAY: 430 for (unsigned int i = 0; i < type->length; i++) { 431 offset += setup_uniform_values(loc + offset, type->fields.array); 432 } 433 return offset; 434 435 case GLSL_TYPE_SAMPLER: 436 /* The sampler takes up a slot, but we don't use any values from it. */ 437 return 1; 438 439 default: 440 assert(!"not reached"); 441 return 0; 442 } 443} 444 445 446/* Our support for builtin uniforms is even scarier than non-builtin. 447 * It sits on top of the PROG_STATE_VAR parameters that are 448 * automatically updated from GL context state. 449 */ 450void 451fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 452{ 453 const ir_state_slot *const slots = ir->state_slots; 454 assert(ir->state_slots != NULL); 455 456 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 457 /* This state reference has already been setup by ir_to_mesa, but we'll 458 * get the same index back here. 459 */ 460 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 461 (gl_state_index *)slots[i].tokens); 462 463 /* Add each of the unique swizzles of the element as a parameter. 464 * This'll end up matching the expected layout of the 465 * array/matrix/structure we're trying to fill in. 
466 */ 467 int last_swiz = -1; 468 for (unsigned int j = 0; j < 4; j++) { 469 int swiz = GET_SWZ(slots[i].swizzle, j); 470 if (swiz == last_swiz) 471 break; 472 last_swiz = swiz; 473 474 c->prog_data.param_convert[c->prog_data.nr_params] = 475 PARAM_NO_CONVERT; 476 this->param_index[c->prog_data.nr_params] = index; 477 this->param_offset[c->prog_data.nr_params] = swiz; 478 c->prog_data.nr_params++; 479 } 480 } 481} 482 483fs_reg * 484fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 485{ 486 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 487 fs_reg wpos = *reg; 488 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 489 490 /* gl_FragCoord.x */ 491 if (ir->pixel_center_integer) { 492 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 493 } else { 494 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 495 } 496 wpos.reg_offset++; 497 498 /* gl_FragCoord.y */ 499 if (!flip && ir->pixel_center_integer) { 500 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 501 } else { 502 fs_reg pixel_y = this->pixel_y; 503 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 504 505 if (flip) { 506 pixel_y.negate = true; 507 offset += c->key.drawable_height - 1.0; 508 } 509 510 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 511 } 512 wpos.reg_offset++; 513 514 /* gl_FragCoord.z */ 515 if (intel->gen >= 6) { 516 emit(BRW_OPCODE_MOV, wpos, 517 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 518 } else { 519 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 520 interp_reg(FRAG_ATTRIB_WPOS, 2)); 521 } 522 wpos.reg_offset++; 523 524 /* gl_FragCoord.w: Already set up in emit_interpolation */ 525 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 526 527 return reg; 528} 529 530fs_reg * 531fs_visitor::emit_general_interpolation(ir_variable *ir) 532{ 533 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 534 /* Interpolation is always in floating point regs. */ 535 reg->type = BRW_REGISTER_TYPE_F; 536 fs_reg attr = *reg; 537 538 unsigned int array_elements; 539 const glsl_type *type; 540 541 if (ir->type->is_array()) { 542 array_elements = ir->type->length; 543 if (array_elements == 0) { 544 fail("dereferenced array '%s' has length 0\n", ir->name); 545 } 546 type = ir->type->fields.array; 547 } else { 548 array_elements = 1; 549 type = ir->type; 550 } 551 552 int location = ir->location; 553 for (unsigned int i = 0; i < array_elements; i++) { 554 for (unsigned int j = 0; j < type->matrix_columns; j++) { 555 if (urb_setup[location] == -1) { 556 /* If there's no incoming setup data for this slot, don't 557 * emit interpolation for it. 558 */ 559 attr.reg_offset += type->vector_elements; 560 location++; 561 continue; 562 } 563 564 bool is_gl_Color = 565 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 566 567 if (c->key.flat_shade && is_gl_Color) { 568 /* Constant interpolation (flat shading) case. The SF has 569 * handed us defined values in only the constant offset 570 * field of the setup reg. 571 */ 572 for (unsigned int k = 0; k < type->vector_elements; k++) { 573 struct brw_reg interp = interp_reg(location, k); 574 interp = suboffset(interp, 3); 575 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 576 attr.reg_offset++; 577 } 578 } else { 579 /* Perspective interpolation case. 
*/ 580 for (unsigned int k = 0; k < type->vector_elements; k++) { 581 struct brw_reg interp = interp_reg(location, k); 582 emit(FS_OPCODE_LINTERP, attr, 583 this->delta_x, this->delta_y, fs_reg(interp)); 584 attr.reg_offset++; 585 } 586 587 if (intel->gen < 6) { 588 attr.reg_offset -= type->vector_elements; 589 for (unsigned int k = 0; k < type->vector_elements; k++) { 590 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 591 attr.reg_offset++; 592 } 593 } 594 } 595 location++; 596 } 597 } 598 599 return reg; 600} 601 602fs_reg * 603fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 604{ 605 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 606 607 /* The frontfacing comes in as a bit in the thread payload. */ 608 if (intel->gen >= 6) { 609 emit(BRW_OPCODE_ASR, *reg, 610 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 611 fs_reg(15)); 612 emit(BRW_OPCODE_NOT, *reg, *reg); 613 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 614 } else { 615 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 616 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 617 * us front face 618 */ 619 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 620 fs_reg(r1_6ud), 621 fs_reg(1u << 31)); 622 inst->conditional_mod = BRW_CONDITIONAL_L; 623 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 624 } 625 626 return reg; 627} 628 629fs_inst * 630fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 631{ 632 switch (opcode) { 633 case FS_OPCODE_RCP: 634 case FS_OPCODE_RSQ: 635 case FS_OPCODE_SQRT: 636 case FS_OPCODE_EXP2: 637 case FS_OPCODE_LOG2: 638 case FS_OPCODE_SIN: 639 case FS_OPCODE_COS: 640 break; 641 default: 642 assert(!"not reached: bad math opcode"); 643 return NULL; 644 } 645 646 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 647 * might be able to do better by doing execsize = 1 math and then 648 * expanding that result out, but we would need to be careful with 649 * masking. 650 * 651 * The hardware ignores source modifiers (negate and abs) on math 652 * instructions, so we also move to a temp to set those up. 653 */ 654 if (intel->gen >= 6 && (src.file == UNIFORM || 655 src.abs || 656 src.negate)) { 657 fs_reg expanded = fs_reg(this, glsl_type::float_type); 658 emit(BRW_OPCODE_MOV, expanded, src); 659 src = expanded; 660 } 661 662 fs_inst *inst = emit(opcode, dst, src); 663 664 if (intel->gen < 6) { 665 inst->base_mrf = 2; 666 inst->mlen = c->dispatch_width / 8; 667 } 668 669 return inst; 670} 671 672fs_inst * 673fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 674{ 675 int base_mrf = 2; 676 fs_inst *inst; 677 678 assert(opcode == FS_OPCODE_POW); 679 680 if (intel->gen >= 6) { 681 /* Can't do hstride == 0 args to gen6 math, so expand it out. 682 * 683 * The hardware ignores source modifiers (negate and abs) on math 684 * instructions, so we also move to a temp to set those up. 
685 */ 686 if (src0.file == UNIFORM || src0.abs || src0.negate) { 687 fs_reg expanded = fs_reg(this, glsl_type::float_type); 688 emit(BRW_OPCODE_MOV, expanded, src0); 689 src0 = expanded; 690 } 691 692 if (src1.file == UNIFORM || src1.abs || src1.negate) { 693 fs_reg expanded = fs_reg(this, glsl_type::float_type); 694 emit(BRW_OPCODE_MOV, expanded, src1); 695 src1 = expanded; 696 } 697 698 inst = emit(opcode, dst, src0, src1); 699 } else { 700 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 701 inst = emit(opcode, dst, src0, reg_null_f); 702 703 inst->base_mrf = base_mrf; 704 inst->mlen = 2 * c->dispatch_width / 8; 705 } 706 return inst; 707} 708 709void 710fs_visitor::visit(ir_variable *ir) 711{ 712 fs_reg *reg = NULL; 713 714 if (variable_storage(ir)) 715 return; 716 717 if (strcmp(ir->name, "gl_FragColor") == 0) { 718 this->frag_color = ir; 719 } else if (strcmp(ir->name, "gl_FragData") == 0) { 720 this->frag_data = ir; 721 } else if (strcmp(ir->name, "gl_FragDepth") == 0) { 722 this->frag_depth = ir; 723 } 724 725 if (ir->mode == ir_var_in) { 726 if (!strcmp(ir->name, "gl_FragCoord")) { 727 reg = emit_fragcoord_interpolation(ir); 728 } else if (!strcmp(ir->name, "gl_FrontFacing")) { 729 reg = emit_frontfacing_interpolation(ir); 730 } else { 731 reg = emit_general_interpolation(ir); 732 } 733 assert(reg); 734 hash_table_insert(this->variable_ht, reg, ir); 735 return; 736 } 737 738 if (ir->mode == ir_var_uniform) { 739 int param_index = c->prog_data.nr_params; 740 741 if (c->dispatch_width == 16) { 742 if (!variable_storage(ir)) { 743 fail("Failed to find uniform '%s' in 16-wide\n", ir->name); 744 } 745 return; 746 } 747 748 if (!strncmp(ir->name, "gl_", 3)) { 749 setup_builtin_uniform_values(ir); 750 } else { 751 setup_uniform_values(ir->location, ir->type); 752 } 753 754 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); 755 reg->type = brw_type_for_base_type(ir->type); 756 } 757 758 if (!reg) 759 reg = new(this->mem_ctx) fs_reg(this, ir->type); 760 761 hash_table_insert(this->variable_ht, reg, ir); 762} 763 764void 765fs_visitor::visit(ir_dereference_variable *ir) 766{ 767 fs_reg *reg = variable_storage(ir->var); 768 this->result = *reg; 769} 770 771void 772fs_visitor::visit(ir_dereference_record *ir) 773{ 774 const glsl_type *struct_type = ir->record->type; 775 776 ir->record->accept(this); 777 778 unsigned int offset = 0; 779 for (unsigned int i = 0; i < struct_type->length; i++) { 780 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) 781 break; 782 offset += type_size(struct_type->fields.structure[i].type); 783 } 784 this->result.reg_offset += offset; 785 this->result.type = brw_type_for_base_type(ir->type); 786} 787 788void 789fs_visitor::visit(ir_dereference_array *ir) 790{ 791 ir_constant *index; 792 int element_size; 793 794 ir->array->accept(this); 795 index = ir->array_index->as_constant(); 796 797 element_size = type_size(ir->type); 798 this->result.type = brw_type_for_base_type(ir->type); 799 800 if (index) { 801 assert(this->result.file == UNIFORM || 802 (this->result.file == GRF && 803 this->result.reg != 0)); 804 this->result.reg_offset += index->value.i[0] * element_size; 805 } else { 806 assert(!"FINISHME: non-constant array element"); 807 } 808} 809 810/* Instruction selection: Produce a MOV.sat instead of 811 * MIN(MAX(val, 0), 1) when possible. 
812 */ 813bool 814fs_visitor::try_emit_saturate(ir_expression *ir) 815{ 816 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 817 818 if (!sat_val) 819 return false; 820 821 sat_val->accept(this); 822 fs_reg src = this->result; 823 824 this->result = fs_reg(this, ir->type); 825 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src); 826 inst->saturate = true; 827 828 return true; 829} 830 831static uint32_t 832brw_conditional_for_comparison(unsigned int op) 833{ 834 switch (op) { 835 case ir_binop_less: 836 return BRW_CONDITIONAL_L; 837 case ir_binop_greater: 838 return BRW_CONDITIONAL_G; 839 case ir_binop_lequal: 840 return BRW_CONDITIONAL_LE; 841 case ir_binop_gequal: 842 return BRW_CONDITIONAL_GE; 843 case ir_binop_equal: 844 case ir_binop_all_equal: /* same as equal for scalars */ 845 return BRW_CONDITIONAL_Z; 846 case ir_binop_nequal: 847 case ir_binop_any_nequal: /* same as nequal for scalars */ 848 return BRW_CONDITIONAL_NZ; 849 default: 850 assert(!"not reached: bad operation for comparison"); 851 return BRW_CONDITIONAL_NZ; 852 } 853} 854 855void 856fs_visitor::visit(ir_expression *ir) 857{ 858 unsigned int operand; 859 fs_reg op[2], temp; 860 fs_inst *inst; 861 862 assert(ir->get_num_operands() <= 2); 863 864 if (try_emit_saturate(ir)) 865 return; 866 867 for (operand = 0; operand < ir->get_num_operands(); operand++) { 868 ir->operands[operand]->accept(this); 869 if (this->result.file == BAD_FILE) { 870 ir_print_visitor v; 871 fail("Failed to get tree for expression operand:\n"); 872 ir->operands[operand]->accept(&v); 873 } 874 op[operand] = this->result; 875 876 /* Matrix expression operands should have been broken down to vector 877 * operations already. 878 */ 879 assert(!ir->operands[operand]->type->is_matrix()); 880 /* And then those vector operands should have been broken down to scalar. 881 */ 882 assert(!ir->operands[operand]->type->is_vector()); 883 } 884 885 /* Storage for our result. If our result goes into an assignment, it will 886 * just get copy-propagated out, so no worries. 887 */ 888 this->result = fs_reg(this, ir->type); 889 890 switch (ir->operation) { 891 case ir_unop_logic_not: 892 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 893 * ones complement of the whole register, not just bit 0. 
894 */ 895 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)); 896 break; 897 case ir_unop_neg: 898 op[0].negate = !op[0].negate; 899 this->result = op[0]; 900 break; 901 case ir_unop_abs: 902 op[0].abs = true; 903 op[0].negate = false; 904 this->result = op[0]; 905 break; 906 case ir_unop_sign: 907 temp = fs_reg(this, ir->type); 908 909 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)); 910 911 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 912 inst->conditional_mod = BRW_CONDITIONAL_G; 913 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)); 914 inst->predicated = true; 915 916 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 917 inst->conditional_mod = BRW_CONDITIONAL_L; 918 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)); 919 inst->predicated = true; 920 921 break; 922 case ir_unop_rcp: 923 emit_math(FS_OPCODE_RCP, this->result, op[0]); 924 break; 925 926 case ir_unop_exp2: 927 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 928 break; 929 case ir_unop_log2: 930 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 931 break; 932 case ir_unop_exp: 933 case ir_unop_log: 934 assert(!"not reached: should be handled by ir_explog_to_explog2"); 935 break; 936 case ir_unop_sin: 937 case ir_unop_sin_reduced: 938 emit_math(FS_OPCODE_SIN, this->result, op[0]); 939 break; 940 case ir_unop_cos: 941 case ir_unop_cos_reduced: 942 emit_math(FS_OPCODE_COS, this->result, op[0]); 943 break; 944 945 case ir_unop_dFdx: 946 emit(FS_OPCODE_DDX, this->result, op[0]); 947 break; 948 case ir_unop_dFdy: 949 emit(FS_OPCODE_DDY, this->result, op[0]); 950 break; 951 952 case ir_binop_add: 953 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]); 954 break; 955 case ir_binop_sub: 956 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 957 break; 958 959 case ir_binop_mul: 960 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]); 961 break; 962 case ir_binop_div: 963 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 964 break; 965 case ir_binop_mod: 966 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 967 break; 968 969 case ir_binop_less: 970 case ir_binop_greater: 971 case ir_binop_lequal: 972 case ir_binop_gequal: 973 case ir_binop_equal: 974 case ir_binop_all_equal: 975 case ir_binop_nequal: 976 case ir_binop_any_nequal: 977 temp = this->result; 978 /* original gen4 does implicit conversion before comparison. 
*/ 979 if (intel->gen < 5) 980 temp.type = op[0].type; 981 982 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); 983 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 984 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)); 985 break; 986 987 case ir_binop_logic_xor: 988 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 989 break; 990 991 case ir_binop_logic_or: 992 emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 993 break; 994 995 case ir_binop_logic_and: 996 emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 997 break; 998 999 case ir_binop_dot: 1000 case ir_unop_any: 1001 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 1002 break; 1003 1004 case ir_unop_noise: 1005 assert(!"not reached: should be handled by lower_noise"); 1006 break; 1007 1008 case ir_quadop_vector: 1009 assert(!"not reached: should be handled by lower_quadop_vector"); 1010 break; 1011 1012 case ir_unop_sqrt: 1013 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 1014 break; 1015 1016 case ir_unop_rsq: 1017 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 1018 break; 1019 1020 case ir_unop_i2f: 1021 case ir_unop_b2f: 1022 case ir_unop_b2i: 1023 case ir_unop_f2i: 1024 emit(BRW_OPCODE_MOV, this->result, op[0]); 1025 break; 1026 case ir_unop_f2b: 1027 case ir_unop_i2b: 1028 temp = this->result; 1029 /* original gen4 does implicit conversion before comparison. */ 1030 if (intel->gen < 5) 1031 temp.type = op[0].type; 1032 1033 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)); 1034 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1035 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1)); 1036 break; 1037 1038 case ir_unop_trunc: 1039 emit(BRW_OPCODE_RNDZ, this->result, op[0]); 1040 break; 1041 case ir_unop_ceil: 1042 op[0].negate = !op[0].negate; 1043 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1044 this->result.negate = true; 1045 break; 1046 case ir_unop_floor: 1047 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1048 break; 1049 case ir_unop_fract: 1050 inst = emit(BRW_OPCODE_FRC, this->result, op[0]); 1051 break; 1052 case ir_unop_round_even: 1053 emit(BRW_OPCODE_RNDE, this->result, op[0]); 1054 break; 1055 1056 case ir_binop_min: 1057 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1058 inst->conditional_mod = BRW_CONDITIONAL_L; 1059 1060 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1061 inst->predicated = true; 1062 break; 1063 case ir_binop_max: 1064 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1065 inst->conditional_mod = BRW_CONDITIONAL_G; 1066 1067 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1068 inst->predicated = true; 1069 break; 1070 1071 case ir_binop_pow: 1072 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1073 break; 1074 1075 case ir_unop_bit_not: 1076 inst = emit(BRW_OPCODE_NOT, this->result, op[0]); 1077 break; 1078 case ir_binop_bit_and: 1079 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 1080 break; 1081 case ir_binop_bit_xor: 1082 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 1083 break; 1084 case ir_binop_bit_or: 1085 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 1086 break; 1087 1088 case ir_unop_u2f: 1089 case ir_binop_lshift: 1090 case ir_binop_rshift: 1091 assert(!"GLSL 1.30 features unsupported"); 1092 break; 1093 } 1094} 1095 1096void 1097fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1098 const glsl_type *type, bool predicated) 1099{ 1100 switch (type->base_type) { 1101 case GLSL_TYPE_FLOAT: 1102 case GLSL_TYPE_UINT: 1103 case 
GLSL_TYPE_INT: 1104 case GLSL_TYPE_BOOL: 1105 for (unsigned int i = 0; i < type->components(); i++) { 1106 l.type = brw_type_for_base_type(type); 1107 r.type = brw_type_for_base_type(type); 1108 1109 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r); 1110 inst->predicated = predicated; 1111 1112 l.reg_offset++; 1113 r.reg_offset++; 1114 } 1115 break; 1116 case GLSL_TYPE_ARRAY: 1117 for (unsigned int i = 0; i < type->length; i++) { 1118 emit_assignment_writes(l, r, type->fields.array, predicated); 1119 } 1120 break; 1121 1122 case GLSL_TYPE_STRUCT: 1123 for (unsigned int i = 0; i < type->length; i++) { 1124 emit_assignment_writes(l, r, type->fields.structure[i].type, 1125 predicated); 1126 } 1127 break; 1128 1129 case GLSL_TYPE_SAMPLER: 1130 break; 1131 1132 default: 1133 assert(!"not reached"); 1134 break; 1135 } 1136} 1137 1138void 1139fs_visitor::visit(ir_assignment *ir) 1140{ 1141 struct fs_reg l, r; 1142 fs_inst *inst; 1143 1144 /* FINISHME: arrays on the lhs */ 1145 ir->lhs->accept(this); 1146 l = this->result; 1147 1148 ir->rhs->accept(this); 1149 r = this->result; 1150 1151 assert(l.file != BAD_FILE); 1152 assert(r.file != BAD_FILE); 1153 1154 if (ir->condition) { 1155 emit_bool_to_cond_code(ir->condition); 1156 } 1157 1158 if (ir->lhs->type->is_scalar() || 1159 ir->lhs->type->is_vector()) { 1160 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1161 if (ir->write_mask & (1 << i)) { 1162 inst = emit(BRW_OPCODE_MOV, l, r); 1163 if (ir->condition) 1164 inst->predicated = true; 1165 r.reg_offset++; 1166 } 1167 l.reg_offset++; 1168 } 1169 } else { 1170 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1171 } 1172} 1173 1174fs_inst * 1175fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, 1176 int sampler) 1177{ 1178 int mlen; 1179 int base_mrf = 1; 1180 bool simd16 = false; 1181 fs_reg orig_dst; 1182 1183 /* g0 header. */ 1184 mlen = 1; 1185 1186 if (ir->shadow_comparitor) { 1187 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1188 fs_inst *inst = emit(BRW_OPCODE_MOV, 1189 fs_reg(MRF, base_mrf + mlen + i), coordinate); 1190 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler)) 1191 inst->saturate = true; 1192 1193 coordinate.reg_offset++; 1194 } 1195 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1196 mlen += 3; 1197 1198 if (ir->op == ir_tex) { 1199 /* There's no plain shadow compare message, so we use shadow 1200 * compare with a bias of 0.0. 1201 */ 1202 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); 1203 mlen++; 1204 } else if (ir->op == ir_txb) { 1205 ir->lod_info.bias->accept(this); 1206 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1207 mlen++; 1208 } else { 1209 assert(ir->op == ir_txl); 1210 ir->lod_info.lod->accept(this); 1211 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1212 mlen++; 1213 } 1214 1215 ir->shadow_comparitor->accept(this); 1216 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1217 mlen++; 1218 } else if (ir->op == ir_tex) { 1219 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1220 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), 1221 coordinate); 1222 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler)) 1223 inst->saturate = true; 1224 coordinate.reg_offset++; 1225 } 1226 /* gen4's SIMD8 sampler always has the slots for u,v,r present. 
*/ 1227 mlen += 3; 1228 } else if (ir->op == ir_txd) { 1229 assert(!"TXD isn't supported on gen4 yet."); 1230 } else { 1231 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1232 * instructions. We'll need to do SIMD16 here. 1233 */ 1234 assert(ir->op == ir_txb || ir->op == ir_txl); 1235 1236 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1237 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, 1238 base_mrf + mlen + i * 2), 1239 coordinate); 1240 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler)) 1241 inst->saturate = true; 1242 coordinate.reg_offset++; 1243 } 1244 1245 /* lod/bias appears after u/v/r. */ 1246 mlen += 6; 1247 1248 if (ir->op == ir_txb) { 1249 ir->lod_info.bias->accept(this); 1250 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1251 mlen++; 1252 } else { 1253 ir->lod_info.lod->accept(this); 1254 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1255 mlen++; 1256 } 1257 1258 /* The unused upper half. */ 1259 mlen++; 1260 1261 /* Now, since we're doing simd16, the return is 2 interleaved 1262 * vec4s where the odd-indexed ones are junk. We'll need to move 1263 * this weirdness around to the expected layout. 1264 */ 1265 simd16 = true; 1266 orig_dst = dst; 1267 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1268 2)); 1269 dst.type = BRW_REGISTER_TYPE_F; 1270 } 1271 1272 fs_inst *inst = NULL; 1273 switch (ir->op) { 1274 case ir_tex: 1275 inst = emit(FS_OPCODE_TEX, dst); 1276 break; 1277 case ir_txb: 1278 inst = emit(FS_OPCODE_TXB, dst); 1279 break; 1280 case ir_txl: 1281 inst = emit(FS_OPCODE_TXL, dst); 1282 break; 1283 case ir_txd: 1284 inst = emit(FS_OPCODE_TXD, dst); 1285 break; 1286 case ir_txf: 1287 assert(!"GLSL 1.30 features unsupported"); 1288 break; 1289 } 1290 inst->base_mrf = base_mrf; 1291 inst->mlen = mlen; 1292 inst->header_present = true; 1293 1294 if (simd16) { 1295 for (int i = 0; i < 4; i++) { 1296 emit(BRW_OPCODE_MOV, orig_dst, dst); 1297 orig_dst.reg_offset++; 1298 dst.reg_offset += 2; 1299 } 1300 } 1301 1302 return inst; 1303} 1304 1305/* gen5's sampler has slots for u, v, r, array index, then optional 1306 * parameters like shadow comparitor or LOD bias. If optional 1307 * parameters aren't present, those base slots are optional and don't 1308 * need to be included in the message. 1309 * 1310 * We don't fill in the unnecessary slots regardless, which may look 1311 * surprising in the disassembly. 1312 */ 1313fs_inst * 1314fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, 1315 int sampler) 1316{ 1317 int mlen = 0; 1318 int base_mrf = 2; 1319 int reg_width = c->dispatch_width / 8; 1320 bool header_present = false; 1321 1322 if (ir->offset) { 1323 /* The offsets set up by the ir_texture visitor are in the 1324 * m1 header, so we can't go headerless. 
1325 */ 1326 header_present = true; 1327 mlen++; 1328 base_mrf--; 1329 } 1330 1331 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1332 fs_inst *inst = emit(BRW_OPCODE_MOV, 1333 fs_reg(MRF, base_mrf + mlen + i * reg_width), 1334 coordinate); 1335 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler)) 1336 inst->saturate = true; 1337 coordinate.reg_offset++; 1338 } 1339 mlen += ir->coordinate->type->vector_elements * reg_width; 1340 1341 if (ir->shadow_comparitor) { 1342 mlen = MAX2(mlen, header_present + 4 * reg_width); 1343 1344 ir->shadow_comparitor->accept(this); 1345 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1346 mlen += reg_width; 1347 } 1348 1349 fs_inst *inst = NULL; 1350 switch (ir->op) { 1351 case ir_tex: 1352 inst = emit(FS_OPCODE_TEX, dst); 1353 break; 1354 case ir_txb: 1355 ir->lod_info.bias->accept(this); 1356 mlen = MAX2(mlen, header_present + 4 * reg_width); 1357 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1358 mlen += reg_width; 1359 1360 inst = emit(FS_OPCODE_TXB, dst); 1361 1362 break; 1363 case ir_txl: 1364 ir->lod_info.lod->accept(this); 1365 mlen = MAX2(mlen, header_present + 4 * reg_width); 1366 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1367 mlen += reg_width; 1368 1369 inst = emit(FS_OPCODE_TXL, dst); 1370 break; 1371 case ir_txd: 1372 case ir_txf: 1373 assert(!"GLSL 1.30 features unsupported"); 1374 break; 1375 } 1376 inst->base_mrf = base_mrf; 1377 inst->mlen = mlen; 1378 inst->header_present = header_present; 1379 1380 if (mlen > 11) { 1381 fail("Message length >11 disallowed by hardware\n"); 1382 } 1383 1384 return inst; 1385} 1386 1387fs_inst * 1388fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, 1389 int sampler) 1390{ 1391 int mlen = 0; 1392 int base_mrf = 2; 1393 int reg_width = c->dispatch_width / 8; 1394 bool header_present = false; 1395 1396 if (ir->offset) { 1397 /* The offsets set up by the ir_texture visitor are in the 1398 * m1 header, so we can't go headerless. 
1399 */ 1400 header_present = true; 1401 mlen++; 1402 base_mrf--; 1403 } 1404 1405 if (ir->shadow_comparitor) { 1406 ir->shadow_comparitor->accept(this); 1407 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1408 mlen += reg_width; 1409 } 1410 1411 /* Set up the LOD info */ 1412 switch (ir->op) { 1413 case ir_tex: 1414 break; 1415 case ir_txb: 1416 ir->lod_info.bias->accept(this); 1417 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1418 mlen += reg_width; 1419 break; 1420 case ir_txl: 1421 ir->lod_info.lod->accept(this); 1422 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1423 mlen += reg_width; 1424 break; 1425 case ir_txd: 1426 case ir_txf: 1427 assert(!"GLSL 1.30 features unsupported"); 1428 break; 1429 } 1430 1431 /* Set up the coordinate */ 1432 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1433 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1434 coordinate); 1435 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler)) 1436 inst->saturate = true; 1437 coordinate.reg_offset++; 1438 mlen += reg_width; 1439 } 1440 1441 /* Generate the SEND */ 1442 fs_inst *inst = NULL; 1443 switch (ir->op) { 1444 case ir_tex: inst = emit(FS_OPCODE_TEX, dst); break; 1445 case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break; 1446 case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break; 1447 case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break; 1448 case ir_txf: assert(!"TXF unsupported."); 1449 } 1450 inst->base_mrf = base_mrf; 1451 inst->mlen = mlen; 1452 inst->header_present = header_present; 1453 1454 if (mlen > 11) { 1455 fail("Message length >11 disallowed by hardware\n"); 1456 } 1457 1458 return inst; 1459} 1460 1461void 1462fs_visitor::visit(ir_texture *ir) 1463{ 1464 int sampler; 1465 fs_inst *inst = NULL; 1466 1467 ir->coordinate->accept(this); 1468 fs_reg coordinate = this->result; 1469 1470 if (ir->offset != NULL) { 1471 ir_constant *offset = ir->offset->as_constant(); 1472 assert(offset != NULL); 1473 1474 signed char offsets[3]; 1475 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) 1476 offsets[i] = (signed char) offset->value.i[i]; 1477 1478 /* Combine all three offsets into a single unsigned dword: 1479 * 1480 * bits 11:8 - U Offset (X component) 1481 * bits 7:4 - V Offset (Y component) 1482 * bits 3:0 - R Offset (Z component) 1483 */ 1484 unsigned offset_bits = 0; 1485 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { 1486 const unsigned shift = 4 * (2 - i); 1487 offset_bits |= (offsets[i] << shift) & (0xF << shift); 1488 } 1489 1490 /* Explicitly set up the message header by copying g0 to msg reg m1. */ 1491 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), 1492 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD)); 1493 1494 /* Then set the offset bits in DWord 2 of the message header. */ 1495 emit(BRW_OPCODE_MOV, 1496 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), 1497 BRW_REGISTER_TYPE_UD)), 1498 fs_reg(brw_imm_uw(offset_bits))); 1499 } 1500 1501 /* Should be lowered by do_lower_texture_projection */ 1502 assert(!ir->projector); 1503 1504 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1505 ctx->Shader.CurrentFragmentProgram, 1506 &brw->fragment_program->Base); 1507 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1508 1509 /* The 965 requires the EU to do the normalization of GL rectangle 1510 * texture coordinates. We use the program parameter state 1511 * tracking to get the scaling factor. 
1512 */ 1513 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1514 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1515 int tokens[STATE_LENGTH] = { 1516 STATE_INTERNAL, 1517 STATE_TEXRECT_SCALE, 1518 sampler, 1519 0, 1520 0 1521 }; 1522 1523 if (c->dispatch_width == 16) { 1524 fail("rectangle scale uniform setup not supported on 16-wide\n"); 1525 this->result = fs_reg(this, ir->type); 1526 return; 1527 } 1528 1529 c->prog_data.param_convert[c->prog_data.nr_params] = 1530 PARAM_NO_CONVERT; 1531 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1532 PARAM_NO_CONVERT; 1533 1534 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1535 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1536 GLuint index = _mesa_add_state_reference(params, 1537 (gl_state_index *)tokens); 1538 1539 this->param_index[c->prog_data.nr_params] = index; 1540 this->param_offset[c->prog_data.nr_params] = 0; 1541 c->prog_data.nr_params++; 1542 this->param_index[c->prog_data.nr_params] = index; 1543 this->param_offset[c->prog_data.nr_params] = 1; 1544 c->prog_data.nr_params++; 1545 1546 fs_reg dst = fs_reg(this, ir->coordinate->type); 1547 fs_reg src = coordinate; 1548 coordinate = dst; 1549 1550 emit(BRW_OPCODE_MUL, dst, src, scale_x); 1551 dst.reg_offset++; 1552 src.reg_offset++; 1553 emit(BRW_OPCODE_MUL, dst, src, scale_y); 1554 } 1555 1556 /* Writemasking doesn't eliminate channels on SIMD8 texture 1557 * samples, so don't worry about them. 1558 */ 1559 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1560 1561 if (intel->gen >= 7) { 1562 inst = emit_texture_gen7(ir, dst, coordinate, sampler); 1563 } else if (intel->gen >= 5) { 1564 inst = emit_texture_gen5(ir, dst, coordinate, sampler); 1565 } else { 1566 inst = emit_texture_gen4(ir, dst, coordinate, sampler); 1567 } 1568 1569 /* If there's an offset, we already set up m1. To avoid the implied move, 1570 * use the null register. Otherwise, we want an implied move from g0. 1571 */ 1572 if (ir->offset != NULL || !inst->header_present) 1573 inst->src[0] = fs_reg(brw_null_reg()); 1574 else 1575 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); 1576 1577 inst->sampler = sampler; 1578 1579 this->result = dst; 1580 1581 if (ir->shadow_comparitor) 1582 inst->shadow_compare = true; 1583 1584 if (ir->type == glsl_type::float_type) { 1585 /* Ignore DEPTH_TEXTURE_MODE swizzling. 
*/ 1586 assert(ir->sampler->type->sampler_shadow); 1587 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1588 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1589 1590 for (int i = 0; i < 4; i++) { 1591 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1592 fs_reg l = swizzle_dst; 1593 l.reg_offset += i; 1594 1595 if (swiz == SWIZZLE_ZERO) { 1596 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f)); 1597 } else if (swiz == SWIZZLE_ONE) { 1598 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f)); 1599 } else { 1600 fs_reg r = dst; 1601 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1602 emit(BRW_OPCODE_MOV, l, r); 1603 } 1604 } 1605 this->result = swizzle_dst; 1606 } 1607} 1608 1609void 1610fs_visitor::visit(ir_swizzle *ir) 1611{ 1612 ir->val->accept(this); 1613 fs_reg val = this->result; 1614 1615 if (ir->type->vector_elements == 1) { 1616 this->result.reg_offset += ir->mask.x; 1617 return; 1618 } 1619 1620 fs_reg result = fs_reg(this, ir->type); 1621 this->result = result; 1622 1623 for (unsigned int i = 0; i < ir->type->vector_elements; i++) { 1624 fs_reg channel = val; 1625 int swiz = 0; 1626 1627 switch (i) { 1628 case 0: 1629 swiz = ir->mask.x; 1630 break; 1631 case 1: 1632 swiz = ir->mask.y; 1633 break; 1634 case 2: 1635 swiz = ir->mask.z; 1636 break; 1637 case 3: 1638 swiz = ir->mask.w; 1639 break; 1640 } 1641 1642 channel.reg_offset += swiz; 1643 emit(BRW_OPCODE_MOV, result, channel); 1644 result.reg_offset++; 1645 } 1646} 1647 1648void 1649fs_visitor::visit(ir_discard *ir) 1650{ 1651 assert(ir->condition == NULL); /* FINISHME */ 1652 1653 emit(FS_OPCODE_DISCARD); 1654 kill_emitted = true; 1655} 1656 1657void 1658fs_visitor::visit(ir_constant *ir) 1659{ 1660 /* Set this->result to reg at the bottom of the function because some code 1661 * paths will cause this visitor to be applied to other fields. This will 1662 * cause the value stored in this->result to be modified. 1663 * 1664 * Make reg constant so that it doesn't get accidentally modified along the 1665 * way. Yes, I actually had this problem. 
:( 1666 */ 1667 const fs_reg reg(this, ir->type); 1668 fs_reg dst_reg = reg; 1669 1670 if (ir->type->is_array()) { 1671 const unsigned size = type_size(ir->type->fields.array); 1672 1673 for (unsigned i = 0; i < ir->type->length; i++) { 1674 ir->array_elements[i]->accept(this); 1675 fs_reg src_reg = this->result; 1676 1677 dst_reg.type = src_reg.type; 1678 for (unsigned j = 0; j < size; j++) { 1679 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1680 src_reg.reg_offset++; 1681 dst_reg.reg_offset++; 1682 } 1683 } 1684 } else if (ir->type->is_record()) { 1685 foreach_list(node, &ir->components) { 1686 ir_instruction *const field = (ir_instruction *) node; 1687 const unsigned size = type_size(field->type); 1688 1689 field->accept(this); 1690 fs_reg src_reg = this->result; 1691 1692 dst_reg.type = src_reg.type; 1693 for (unsigned j = 0; j < size; j++) { 1694 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1695 src_reg.reg_offset++; 1696 dst_reg.reg_offset++; 1697 } 1698 } 1699 } else { 1700 const unsigned size = type_size(ir->type); 1701 1702 for (unsigned i = 0; i < size; i++) { 1703 switch (ir->type->base_type) { 1704 case GLSL_TYPE_FLOAT: 1705 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])); 1706 break; 1707 case GLSL_TYPE_UINT: 1708 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])); 1709 break; 1710 case GLSL_TYPE_INT: 1711 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])); 1712 break; 1713 case GLSL_TYPE_BOOL: 1714 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])); 1715 break; 1716 default: 1717 assert(!"Non-float/uint/int/bool constant"); 1718 } 1719 dst_reg.reg_offset++; 1720 } 1721 } 1722 1723 this->result = reg; 1724} 1725 1726void 1727fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1728{ 1729 ir_expression *expr = ir->as_expression(); 1730 1731 if (expr) { 1732 fs_reg op[2]; 1733 fs_inst *inst; 1734 1735 assert(expr->get_num_operands() <= 2); 1736 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1737 assert(expr->operands[i]->type->is_scalar()); 1738 1739 expr->operands[i]->accept(this); 1740 op[i] = this->result; 1741 } 1742 1743 switch (expr->operation) { 1744 case ir_unop_logic_not: 1745 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)); 1746 inst->conditional_mod = BRW_CONDITIONAL_Z; 1747 break; 1748 1749 case ir_binop_logic_xor: 1750 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]); 1751 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1752 break; 1753 1754 case ir_binop_logic_or: 1755 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]); 1756 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1757 break; 1758 1759 case ir_binop_logic_and: 1760 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]); 1761 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1762 break; 1763 1764 case ir_unop_f2b: 1765 if (intel->gen >= 6) { 1766 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f)); 1767 } else { 1768 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]); 1769 } 1770 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1771 break; 1772 1773 case ir_unop_i2b: 1774 if (intel->gen >= 6) { 1775 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)); 1776 } else { 1777 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]); 1778 } 1779 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1780 break; 1781 1782 case ir_binop_greater: 1783 case ir_binop_gequal: 1784 case ir_binop_less: 1785 case ir_binop_lequal: 1786 case ir_binop_equal: 1787 case ir_binop_all_equal: 1788 case ir_binop_nequal: 1789 case ir_binop_any_nequal: 1790 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]); 1791 
inst->conditional_mod = 1792 brw_conditional_for_comparison(expr->operation); 1793 break; 1794 1795 default: 1796 assert(!"not reached"); 1797 fail("bad cond code\n"); 1798 break; 1799 } 1800 return; 1801 } 1802 1803 ir->accept(this); 1804 1805 if (intel->gen >= 6) { 1806 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1)); 1807 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1808 } else { 1809 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result); 1810 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1811 } 1812} 1813 1814/** 1815 * Emit a gen6 IF statement with the comparison folded into the IF 1816 * instruction. 1817 */ 1818void 1819fs_visitor::emit_if_gen6(ir_if *ir) 1820{ 1821 ir_expression *expr = ir->condition->as_expression(); 1822 1823 if (expr) { 1824 fs_reg op[2]; 1825 fs_inst *inst; 1826 fs_reg temp; 1827 1828 assert(expr->get_num_operands() <= 2); 1829 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1830 assert(expr->operands[i]->type->is_scalar()); 1831 1832 expr->operands[i]->accept(this); 1833 op[i] = this->result; 1834 } 1835 1836 switch (expr->operation) { 1837 case ir_unop_logic_not: 1838 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0)); 1839 inst->conditional_mod = BRW_CONDITIONAL_Z; 1840 return; 1841 1842 case ir_binop_logic_xor: 1843 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1844 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1845 return; 1846 1847 case ir_binop_logic_or: 1848 temp = fs_reg(this, glsl_type::bool_type); 1849 emit(BRW_OPCODE_OR, temp, op[0], op[1]); 1850 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1851 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1852 return; 1853 1854 case ir_binop_logic_and: 1855 temp = fs_reg(this, glsl_type::bool_type); 1856 emit(BRW_OPCODE_AND, temp, op[0], op[1]); 1857 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1858 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1859 return; 1860 1861 case ir_unop_f2b: 1862 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)); 1863 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1864 return; 1865 1866 case ir_unop_i2b: 1867 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1868 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1869 return; 1870 1871 case ir_binop_greater: 1872 case ir_binop_gequal: 1873 case ir_binop_less: 1874 case ir_binop_lequal: 1875 case ir_binop_equal: 1876 case ir_binop_all_equal: 1877 case ir_binop_nequal: 1878 case ir_binop_any_nequal: 1879 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1880 inst->conditional_mod = 1881 brw_conditional_for_comparison(expr->operation); 1882 return; 1883 default: 1884 assert(!"not reached"); 1885 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1886 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1887 fail("bad condition\n"); 1888 return; 1889 } 1890 return; 1891 } 1892 1893 ir->condition->accept(this); 1894 1895 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)); 1896 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1897} 1898 1899void 1900fs_visitor::visit(ir_if *ir) 1901{ 1902 fs_inst *inst; 1903 1904 if (intel->gen != 6 && c->dispatch_width == 16) { 1905 fail("Can't support (non-uniform) control flow on 16-wide\n"); 1906 } 1907 1908 /* Don't point the annotation at the if statement, because then it plus 1909 * the then and else blocks get printed. 
1910 */ 1911 this->base_ir = ir->condition; 1912 1913 if (intel->gen == 6) { 1914 emit_if_gen6(ir); 1915 } else { 1916 emit_bool_to_cond_code(ir->condition); 1917 1918 inst = emit(BRW_OPCODE_IF); 1919 inst->predicated = true; 1920 } 1921 1922 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1923 ir_instruction *ir = (ir_instruction *)iter.get(); 1924 this->base_ir = ir; 1925 1926 ir->accept(this); 1927 } 1928 1929 if (!ir->else_instructions.is_empty()) { 1930 emit(BRW_OPCODE_ELSE); 1931 1932 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1933 ir_instruction *ir = (ir_instruction *)iter.get(); 1934 this->base_ir = ir; 1935 1936 ir->accept(this); 1937 } 1938 } 1939 1940 emit(BRW_OPCODE_ENDIF); 1941} 1942 1943void 1944fs_visitor::visit(ir_loop *ir) 1945{ 1946 fs_reg counter = reg_undef; 1947 1948 if (c->dispatch_width == 16) { 1949 fail("Can't support (non-uniform) control flow on 16-wide\n"); 1950 } 1951 1952 if (ir->counter) { 1953 this->base_ir = ir->counter; 1954 ir->counter->accept(this); 1955 counter = *(variable_storage(ir->counter)); 1956 1957 if (ir->from) { 1958 this->base_ir = ir->from; 1959 ir->from->accept(this); 1960 1961 emit(BRW_OPCODE_MOV, counter, this->result); 1962 } 1963 } 1964 1965 emit(BRW_OPCODE_DO); 1966 1967 if (ir->to) { 1968 this->base_ir = ir->to; 1969 ir->to->accept(this); 1970 1971 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result); 1972 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); 1973 1974 inst = emit(BRW_OPCODE_BREAK); 1975 inst->predicated = true; 1976 } 1977 1978 foreach_iter(exec_list_iterator, iter, ir->body_instructions) { 1979 ir_instruction *ir = (ir_instruction *)iter.get(); 1980 1981 this->base_ir = ir; 1982 ir->accept(this); 1983 } 1984 1985 if (ir->increment) { 1986 this->base_ir = ir->increment; 1987 ir->increment->accept(this); 1988 emit(BRW_OPCODE_ADD, counter, counter, this->result); 1989 } 1990 1991 emit(BRW_OPCODE_WHILE); 1992} 1993 1994void 1995fs_visitor::visit(ir_loop_jump *ir) 1996{ 1997 switch (ir->mode) { 1998 case ir_loop_jump::jump_break: 1999 emit(BRW_OPCODE_BREAK); 2000 break; 2001 case ir_loop_jump::jump_continue: 2002 emit(BRW_OPCODE_CONTINUE); 2003 break; 2004 } 2005} 2006 2007void 2008fs_visitor::visit(ir_call *ir) 2009{ 2010 assert(!"FINISHME"); 2011} 2012 2013void 2014fs_visitor::visit(ir_return *ir) 2015{ 2016 assert(!"FINISHME"); 2017} 2018 2019void 2020fs_visitor::visit(ir_function *ir) 2021{ 2022 /* Ignore function bodies other than main() -- we shouldn't see calls to 2023 * them since they should all be inlined before we get to ir_to_mesa. 
2024 */ 2025 if (strcmp(ir->name, "main") == 0) { 2026 const ir_function_signature *sig; 2027 exec_list empty; 2028 2029 sig = ir->matching_signature(&empty); 2030 2031 assert(sig); 2032 2033 foreach_iter(exec_list_iterator, iter, sig->body) { 2034 ir_instruction *ir = (ir_instruction *)iter.get(); 2035 this->base_ir = ir; 2036 2037 ir->accept(this); 2038 } 2039 } 2040} 2041 2042void 2043fs_visitor::visit(ir_function_signature *ir) 2044{ 2045 assert(!"not reached"); 2046 (void)ir; 2047} 2048 2049fs_inst * 2050fs_visitor::emit(fs_inst inst) 2051{ 2052 fs_inst *list_inst = new(mem_ctx) fs_inst; 2053 *list_inst = inst; 2054 2055 if (force_uncompressed_stack > 0) 2056 list_inst->force_uncompressed = true; 2057 else if (force_sechalf_stack > 0) 2058 list_inst->force_sechalf = true; 2059 2060 list_inst->annotation = this->current_annotation; 2061 list_inst->ir = this->base_ir; 2062 2063 this->instructions.push_tail(list_inst); 2064 2065 return list_inst; 2066} 2067 2068/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 2069void 2070fs_visitor::emit_dummy_fs() 2071{ 2072 /* Everyone's favorite color. */ 2073 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f)); 2074 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f)); 2075 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f)); 2076 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f)); 2077 2078 fs_inst *write; 2079 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); 2080 write->base_mrf = 0; 2081} 2082 2083/* The register location here is relative to the start of the URB 2084 * data. It will get adjusted to be a real location before 2085 * generate_code() time. 2086 */ 2087struct brw_reg 2088fs_visitor::interp_reg(int location, int channel) 2089{ 2090 int regnr = urb_setup[location] * 2 + channel / 2; 2091 int stride = (channel & 1) * 4; 2092 2093 assert(urb_setup[location] != -1); 2094 2095 return brw_vec1_grf(regnr, stride); 2096} 2097 2098/** Emits the interpolation for the varying inputs. */ 2099void 2100fs_visitor::emit_interpolation_setup_gen4() 2101{ 2102 this->current_annotation = "compute pixel centers"; 2103 this->pixel_x = fs_reg(this, glsl_type::uint_type); 2104 this->pixel_y = fs_reg(this, glsl_type::uint_type); 2105 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 2106 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 2107 2108 emit(FS_OPCODE_PIXEL_X, this->pixel_x); 2109 emit(FS_OPCODE_PIXEL_Y, this->pixel_y); 2110 2111 this->current_annotation = "compute pixel deltas from v0"; 2112 if (brw->has_pln) { 2113 this->delta_x = fs_reg(this, glsl_type::vec2_type); 2114 this->delta_y = this->delta_x; 2115 this->delta_y.reg_offset++; 2116 } else { 2117 this->delta_x = fs_reg(this, glsl_type::float_type); 2118 this->delta_y = fs_reg(this, glsl_type::float_type); 2119 } 2120 emit(BRW_OPCODE_ADD, this->delta_x, 2121 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))); 2122 emit(BRW_OPCODE_ADD, this->delta_y, 2123 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))); 2124 2125 this->current_annotation = "compute pos.w and 1/pos.w"; 2126 /* Compute wpos.w. It's always in our setup, since it's needed to 2127 * interpolate the other attributes. 2128 */ 2129 this->wpos_w = fs_reg(this, glsl_type::float_type); 2130 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y, 2131 interp_reg(FRAG_ATTRIB_WPOS, 3)); 2132 /* Compute the pixel 1/W value from wpos.w. 
*/ 2133 this->pixel_w = fs_reg(this, glsl_type::float_type); 2134 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 2135 this->current_annotation = NULL; 2136} 2137 2138/** Emits the interpolation for the varying inputs. */ 2139void 2140fs_visitor::emit_interpolation_setup_gen6() 2141{ 2142 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 2143 2144 /* If the pixel centers end up used, the setup is the same as for gen4. */ 2145 this->current_annotation = "compute pixel centers"; 2146 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); 2147 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); 2148 int_pixel_x.type = BRW_REGISTER_TYPE_UW; 2149 int_pixel_y.type = BRW_REGISTER_TYPE_UW; 2150 emit(BRW_OPCODE_ADD, 2151 int_pixel_x, 2152 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 2153 fs_reg(brw_imm_v(0x10101010))); 2154 emit(BRW_OPCODE_ADD, 2155 int_pixel_y, 2156 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 2157 fs_reg(brw_imm_v(0x11001100))); 2158 2159 /* As of gen6, we can no longer mix float and int sources. We have 2160 * to turn the integer pixel centers into floats for their actual 2161 * use. 2162 */ 2163 this->pixel_x = fs_reg(this, glsl_type::float_type); 2164 this->pixel_y = fs_reg(this, glsl_type::float_type); 2165 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x); 2166 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y); 2167 2168 this->current_annotation = "compute pos.w"; 2169 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); 2170 this->wpos_w = fs_reg(this, glsl_type::float_type); 2171 emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w); 2172 2173 this->delta_x = fs_reg(brw_vec8_grf(2, 0)); 2174 this->delta_y = fs_reg(brw_vec8_grf(3, 0)); 2175 2176 this->current_annotation = NULL; 2177} 2178 2179void 2180fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color) 2181{ 2182 int reg_width = c->dispatch_width / 8; 2183 2184 if (c->dispatch_width == 8 || intel->gen == 6) { 2185 /* SIMD8 write looks like: 2186 * m + 0: r0 2187 * m + 1: r1 2188 * m + 2: g0 2189 * m + 3: g1 2190 * 2191 * gen6 SIMD16 DP write looks like: 2192 * m + 0: r0 2193 * m + 1: r1 2194 * m + 2: g0 2195 * m + 3: g1 2196 * m + 4: b0 2197 * m + 5: b1 2198 * m + 6: a0 2199 * m + 7: a1 2200 */ 2201 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width), 2202 color); 2203 } else { 2204 /* pre-gen6 SIMD16 single source DP write looks like: 2205 * m + 0: r0 2206 * m + 1: g0 2207 * m + 2: b0 2208 * m + 3: a0 2209 * m + 4: r1 2210 * m + 5: g1 2211 * m + 6: b1 2212 * m + 7: a1 2213 */ 2214 if (brw->has_compr4) { 2215 /* By setting the high bit of the MRF register number, we 2216 * indicate that we want COMPR4 mode - instead of doing the 2217 * usual destination + 1 for the second half we get 2218 * destination + 4. 
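 * e.g. with the COMPR4 bit set and a base of m2, a single compressed MOV
 * writes its first half to m2 and its second half to m6, matching the
 * r0/r1 layout shown above.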
2219 */ 2220 emit(BRW_OPCODE_MOV, 2221 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color); 2222 } else { 2223 push_force_uncompressed(); 2224 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color); 2225 pop_force_uncompressed(); 2226 2227 push_force_sechalf(); 2228 color.sechalf = true; 2229 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color); 2230 pop_force_sechalf(); 2231 color.sechalf = false; 2232 } 2233 } 2234} 2235 2236void 2237fs_visitor::emit_fb_writes() 2238{ 2239 this->current_annotation = "FB write header"; 2240 GLboolean header_present = GL_TRUE; 2241 int nr = 0; 2242 int reg_width = c->dispatch_width / 8; 2243 2244 if (intel->gen >= 6 && 2245 !this->kill_emitted && 2246 c->key.nr_color_regions == 1) { 2247 header_present = false; 2248 } 2249 2250 if (header_present) { 2251 /* m0, m1 header */ 2252 nr += 2; 2253 } 2254 2255 if (c->aa_dest_stencil_reg) { 2256 push_force_uncompressed(); 2257 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2258 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))); 2259 pop_force_uncompressed(); 2260 } 2261 2262 /* Reserve space for color. It'll be filled in per MRT below. */ 2263 int color_mrf = nr; 2264 nr += 4 * reg_width; 2265 2266 if (c->source_depth_to_render_target) { 2267 if (intel->gen == 6 && c->dispatch_width == 16) { 2268 /* For outputting oDepth on gen6, SIMD8 writes have to be 2269 * used. This would require 8-wide moves of each half to 2270 * message regs, kind of like pre-gen5 SIMD16 FB writes. 2271 * Just bail on doing so for now. 2272 */ 2273 fail("Missing support for simd16 depth writes on gen6\n"); 2274 } 2275 2276 if (c->computes_depth) { 2277 /* Hand over gl_FragDepth. */ 2278 assert(this->frag_depth); 2279 fs_reg depth = *(variable_storage(this->frag_depth)); 2280 2281 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth); 2282 } else { 2283 /* Pass through the payload depth. */ 2284 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), 2285 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 2286 } 2287 nr += reg_width; 2288 } 2289 2290 if (c->dest_depth_reg) { 2291 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), 2292 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))); 2293 nr += reg_width; 2294 } 2295 2296 fs_reg color = reg_undef; 2297 if (this->frag_color) 2298 color = *(variable_storage(this->frag_color)); 2299 else if (this->frag_data) { 2300 color = *(variable_storage(this->frag_data)); 2301 color.type = BRW_REGISTER_TYPE_F; 2302 } 2303 2304 for (int target = 0; target < c->key.nr_color_regions; target++) { 2305 this->current_annotation = ralloc_asprintf(this->mem_ctx, 2306 "FB write target %d", 2307 target); 2308 if (this->frag_color || this->frag_data) { 2309 for (int i = 0; i < 4; i++) { 2310 emit_color_write(i, color_mrf, color); 2311 color.reg_offset++; 2312 } 2313 } 2314 2315 if (this->frag_color) 2316 color.reg_offset -= 4; 2317 2318 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2319 inst->target = target; 2320 inst->base_mrf = 0; 2321 inst->mlen = nr; 2322 if (target == c->key.nr_color_regions - 1) 2323 inst->eot = true; 2324 inst->header_present = header_present; 2325 } 2326 2327 if (c->key.nr_color_regions == 0) { 2328 if (c->key.alpha_test && (this->frag_color || this->frag_data)) { 2329 /* If the alpha test is enabled but there's no color buffer, 2330 * we still need to send alpha out the pipeline to our null 2331 * renderbuffer. 
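 * (only the alpha channel gets written: reg_offset + 3 below selects the
 * .w component of the output color.)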
2332 */ 2333 color.reg_offset += 3; 2334 emit_color_write(3, color_mrf, color); 2335 } 2336 2337 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2338 inst->base_mrf = 0; 2339 inst->mlen = nr; 2340 inst->eot = true; 2341 inst->header_present = header_present; 2342 } 2343 2344 this->current_annotation = NULL; 2345} 2346 2347void 2348fs_visitor::generate_fb_write(fs_inst *inst) 2349{ 2350 GLboolean eot = inst->eot; 2351 struct brw_reg implied_header; 2352 2353 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied 2354 * move, here's g1. 2355 */ 2356 brw_push_insn_state(p); 2357 brw_set_mask_control(p, BRW_MASK_DISABLE); 2358 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2359 2360 if (inst->header_present) { 2361 if (intel->gen >= 6) { 2362 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2363 brw_MOV(p, 2364 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD), 2365 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2366 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2367 2368 if (inst->target > 0) { 2369 /* Set the render target index for choosing BLEND_STATE. */ 2370 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2371 BRW_REGISTER_TYPE_UD), 2372 brw_imm_ud(inst->target)); 2373 } 2374 2375 implied_header = brw_null_reg(); 2376 } else { 2377 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2378 2379 brw_MOV(p, 2380 brw_message_reg(inst->base_mrf + 1), 2381 brw_vec8_grf(1, 0)); 2382 } 2383 } else { 2384 implied_header = brw_null_reg(); 2385 } 2386 2387 brw_pop_insn_state(p); 2388 2389 brw_fb_WRITE(p, 2390 c->dispatch_width, 2391 inst->base_mrf, 2392 implied_header, 2393 inst->target, 2394 inst->mlen, 2395 0, 2396 eot, 2397 inst->header_present); 2398} 2399 2400/* Computes the integer pixel x,y values from the origin. 2401 * 2402 * This is the basis of gl_FragCoord computation, but is also used 2403 * pre-gen6 for computing the deltas from v0 for computing 2404 * interpolation. 2405 */ 2406void 2407fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x) 2408{ 2409 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 2410 struct brw_reg src; 2411 struct brw_reg deltas; 2412 2413 if (is_x) { 2414 src = stride(suboffset(g1_uw, 4), 2, 4, 0); 2415 deltas = brw_imm_v(0x10101010); 2416 } else { 2417 src = stride(suboffset(g1_uw, 5), 2, 4, 0); 2418 deltas = brw_imm_v(0x11001100); 2419 } 2420 2421 if (c->dispatch_width == 16) { 2422 dst = vec16(dst); 2423 } 2424 2425 /* We do this 8 or 16-wide, but since the destination is UW we 2426 * don't do compression in the 16-wide case. 
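 * (sixteen UW values fit in a single register, so one uncompressed ADD is
 * presumably enough to cover the whole dispatch.)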
2427 */ 2428 brw_push_insn_state(p); 2429 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2430 brw_ADD(p, dst, src, deltas); 2431 brw_pop_insn_state(p); 2432} 2433 2434void 2435fs_visitor::generate_linterp(fs_inst *inst, 2436 struct brw_reg dst, struct brw_reg *src) 2437{ 2438 struct brw_reg delta_x = src[0]; 2439 struct brw_reg delta_y = src[1]; 2440 struct brw_reg interp = src[2]; 2441 2442 if (brw->has_pln && 2443 delta_y.nr == delta_x.nr + 1 && 2444 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2445 brw_PLN(p, dst, interp, delta_x); 2446 } else { 2447 brw_LINE(p, brw_null_reg(), interp, delta_x); 2448 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2449 } 2450} 2451 2452void 2453fs_visitor::generate_math(fs_inst *inst, 2454 struct brw_reg dst, struct brw_reg *src) 2455{ 2456 int op; 2457 2458 switch (inst->opcode) { 2459 case FS_OPCODE_RCP: 2460 op = BRW_MATH_FUNCTION_INV; 2461 break; 2462 case FS_OPCODE_RSQ: 2463 op = BRW_MATH_FUNCTION_RSQ; 2464 break; 2465 case FS_OPCODE_SQRT: 2466 op = BRW_MATH_FUNCTION_SQRT; 2467 break; 2468 case FS_OPCODE_EXP2: 2469 op = BRW_MATH_FUNCTION_EXP; 2470 break; 2471 case FS_OPCODE_LOG2: 2472 op = BRW_MATH_FUNCTION_LOG; 2473 break; 2474 case FS_OPCODE_POW: 2475 op = BRW_MATH_FUNCTION_POW; 2476 break; 2477 case FS_OPCODE_SIN: 2478 op = BRW_MATH_FUNCTION_SIN; 2479 break; 2480 case FS_OPCODE_COS: 2481 op = BRW_MATH_FUNCTION_COS; 2482 break; 2483 default: 2484 assert(!"not reached: unknown math function"); 2485 op = 0; 2486 break; 2487 } 2488 2489 if (intel->gen >= 6) { 2490 assert(inst->mlen == 0); 2491 2492 if (inst->opcode == FS_OPCODE_POW) { 2493 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2494 brw_math2(p, dst, op, src[0], src[1]); 2495 2496 if (c->dispatch_width == 16) { 2497 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2498 brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1])); 2499 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2500 } 2501 } else { 2502 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2503 brw_math(p, dst, 2504 op, 2505 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2506 BRW_MATH_SATURATE_NONE, 2507 0, src[0], 2508 BRW_MATH_DATA_VECTOR, 2509 BRW_MATH_PRECISION_FULL); 2510 2511 if (c->dispatch_width == 16) { 2512 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2513 brw_math(p, sechalf(dst), 2514 op, 2515 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2516 BRW_MATH_SATURATE_NONE, 2517 0, sechalf(src[0]), 2518 BRW_MATH_DATA_VECTOR, 2519 BRW_MATH_PRECISION_FULL); 2520 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2521 } 2522 } 2523 } else /* gen <= 5 */{ 2524 assert(inst->mlen >= 1); 2525 2526 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2527 brw_math(p, dst, 2528 op, 2529 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2530 BRW_MATH_SATURATE_NONE, 2531 inst->base_mrf, src[0], 2532 BRW_MATH_DATA_VECTOR, 2533 BRW_MATH_PRECISION_FULL); 2534 2535 if (c->dispatch_width == 16) { 2536 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2537 brw_math(p, sechalf(dst), 2538 op, 2539 inst->saturate ? 
BRW_MATH_SATURATE_SATURATE : 2540 BRW_MATH_SATURATE_NONE, 2541 inst->base_mrf + 1, sechalf(src[0]), 2542 BRW_MATH_DATA_VECTOR, 2543 BRW_MATH_PRECISION_FULL); 2544 2545 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2546 } 2547 } 2548} 2549 2550void 2551fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2552{ 2553 int msg_type = -1; 2554 int rlen = 4; 2555 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2556 2557 if (c->dispatch_width == 16) 2558 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2559 2560 if (intel->gen >= 5) { 2561 switch (inst->opcode) { 2562 case FS_OPCODE_TEX: 2563 if (inst->shadow_compare) { 2564 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 2565 } else { 2566 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 2567 } 2568 break; 2569 case FS_OPCODE_TXB: 2570 if (inst->shadow_compare) { 2571 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 2572 } else { 2573 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 2574 } 2575 break; 2576 case FS_OPCODE_TXL: 2577 if (inst->shadow_compare) { 2578 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 2579 } else { 2580 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 2581 } 2582 break; 2583 case FS_OPCODE_TXD: 2584 assert(!"TXD isn't supported on gen5+ yet."); 2585 break; 2586 } 2587 } else { 2588 switch (inst->opcode) { 2589 case FS_OPCODE_TEX: 2590 /* Note that G45 and older determines shadow compare and dispatch width 2591 * from message length for most messages. 2592 */ 2593 assert(c->dispatch_width == 8); 2594 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2595 if (inst->shadow_compare) { 2596 assert(inst->mlen == 6); 2597 } else { 2598 assert(inst->mlen <= 4); 2599 } 2600 break; 2601 case FS_OPCODE_TXB: 2602 if (inst->shadow_compare) { 2603 assert(inst->mlen == 6); 2604 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 2605 } else { 2606 assert(inst->mlen == 9); 2607 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2608 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2609 } 2610 break; 2611 case FS_OPCODE_TXL: 2612 if (inst->shadow_compare) { 2613 assert(inst->mlen == 6); 2614 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 2615 } else { 2616 assert(inst->mlen == 9); 2617 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 2618 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2619 } 2620 break; 2621 case FS_OPCODE_TXD: 2622 assert(!"TXD isn't supported on gen4 yet."); 2623 break; 2624 } 2625 } 2626 assert(msg_type != -1); 2627 2628 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 2629 rlen = 8; 2630 dst = vec16(dst); 2631 } 2632 2633 brw_SAMPLE(p, 2634 retype(dst, BRW_REGISTER_TYPE_UW), 2635 inst->base_mrf, 2636 src, 2637 SURF_INDEX_TEXTURE(inst->sampler), 2638 inst->sampler, 2639 WRITEMASK_XYZW, 2640 msg_type, 2641 rlen, 2642 inst->mlen, 2643 0, 2644 inst->header_present, 2645 simd_mode); 2646} 2647 2648 2649/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2650 * looking like: 2651 * 2652 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2653 * 2654 * and we're trying to produce: 2655 * 2656 * DDX DDY 2657 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2658 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2659 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2660 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2661 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2662 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2663 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2664 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2665 * 2666 * and add another set of two more subspans if in 16-pixel dispatch mode. 
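 * (ss0/ss1 are the two 2x2 subspans of an 8-wide dispatch, and tl/tr/bl/br
 * are the top-left, top-right, bottom-left and bottom-right pixels within
 * a subspan.)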
2667 * 2668 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2669 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2670 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2671 * between each other. We could probably do it like ddx and swizzle the right 2672 * order later, but bail for now and just produce 2673 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2674 */ 2675void 2676fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2677{ 2678 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2679 BRW_REGISTER_TYPE_F, 2680 BRW_VERTICAL_STRIDE_2, 2681 BRW_WIDTH_2, 2682 BRW_HORIZONTAL_STRIDE_0, 2683 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2684 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2685 BRW_REGISTER_TYPE_F, 2686 BRW_VERTICAL_STRIDE_2, 2687 BRW_WIDTH_2, 2688 BRW_HORIZONTAL_STRIDE_0, 2689 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2690 brw_ADD(p, dst, src0, negate(src1)); 2691} 2692 2693void 2694fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2695{ 2696 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2697 BRW_REGISTER_TYPE_F, 2698 BRW_VERTICAL_STRIDE_4, 2699 BRW_WIDTH_4, 2700 BRW_HORIZONTAL_STRIDE_0, 2701 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2702 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2703 BRW_REGISTER_TYPE_F, 2704 BRW_VERTICAL_STRIDE_4, 2705 BRW_WIDTH_4, 2706 BRW_HORIZONTAL_STRIDE_0, 2707 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2708 brw_ADD(p, dst, src0, negate(src1)); 2709} 2710 2711void 2712fs_visitor::generate_discard(fs_inst *inst) 2713{ 2714 struct brw_reg f0 = brw_flag_reg(); 2715 2716 if (intel->gen >= 6) { 2717 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2718 struct brw_reg some_register; 2719 2720 /* As of gen6, we no longer have the mask register to look at, 2721 * so life gets a bit more complicated. 2722 */ 2723 2724 /* Load the flag register with all ones. */ 2725 brw_push_insn_state(p); 2726 brw_set_mask_control(p, BRW_MASK_DISABLE); 2727 brw_MOV(p, f0, brw_imm_uw(0xffff)); 2728 brw_pop_insn_state(p); 2729 2730 /* Do a comparison that should always fail, to produce 0s in the flag 2731 * reg where we have active channels. 2732 */ 2733 some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2734 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2735 BRW_CONDITIONAL_NZ, some_register, some_register); 2736 2737 /* Undo CMP's whacking of predication*/ 2738 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2739 2740 brw_push_insn_state(p); 2741 brw_set_mask_control(p, BRW_MASK_DISABLE); 2742 brw_AND(p, g1, f0, g1); 2743 brw_pop_insn_state(p); 2744 } else { 2745 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2746 2747 brw_push_insn_state(p); 2748 brw_set_mask_control(p, BRW_MASK_DISABLE); 2749 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2750 2751 /* Unlike the 965, we have the mask reg, so we just need 2752 * somewhere to invert that (containing channels to be disabled) 2753 * so it can be ANDed with the mask of pixels still to be 2754 * written. Use the flag reg for consistency with gen6+. 
2755 */ 2756 brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */ 2757 brw_AND(p, g0, f0, g0); 2758 2759 brw_pop_insn_state(p); 2760 } 2761} 2762 2763void 2764fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2765{ 2766 assert(inst->mlen != 0); 2767 2768 brw_MOV(p, 2769 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2770 retype(src, BRW_REGISTER_TYPE_UD)); 2771 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2772 inst->offset); 2773} 2774 2775void 2776fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2777{ 2778 assert(inst->mlen != 0); 2779 2780 /* Clear any post destination dependencies that would be ignored by 2781 * the block read. See the B-Spec for pre-gen5 send instruction. 2782 * 2783 * This could use a better solution, since texture sampling and 2784 * math reads could potentially run into it as well -- anywhere 2785 * that we have a SEND with a destination that is a register that 2786 * was written but not read within the last N instructions (what's 2787 * N? unsure). This is rare because of dead code elimination, but 2788 * not impossible. 2789 */ 2790 if (intel->gen == 4 && !intel->is_g4x) 2791 brw_MOV(p, brw_null_reg(), dst); 2792 2793 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2794 inst->offset); 2795 2796 if (intel->gen == 4 && !intel->is_g4x) { 2797 /* gen4 errata: destination from a send can't be used as a 2798 * destination until it's been read. Just read it so we don't 2799 * have to worry. 2800 */ 2801 brw_MOV(p, brw_null_reg(), dst); 2802 } 2803} 2804 2805 2806void 2807fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2808{ 2809 assert(inst->mlen != 0); 2810 2811 /* Clear any post destination dependencies that would be ignored by 2812 * the block read. See the B-Spec for pre-gen5 send instruction. 2813 * 2814 * This could use a better solution, since texture sampling and 2815 * math reads could potentially run into it as well -- anywhere 2816 * that we have a SEND with a destination that is a register that 2817 * was written but not read within the last N instructions (what's 2818 * N? unsure). This is rare because of dead code elimination, but 2819 * not impossible. 2820 */ 2821 if (intel->gen == 4 && !intel->is_g4x) 2822 brw_MOV(p, brw_null_reg(), dst); 2823 2824 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2825 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2826 2827 if (intel->gen == 4 && !intel->is_g4x) { 2828 /* gen4 errata: destination from a send can't be used as a 2829 * destination until it's been read. Just read it so we don't 2830 * have to worry. 2831 */ 2832 brw_MOV(p, brw_null_reg(), dst); 2833 } 2834} 2835 2836/** 2837 * To be called after the last _mesa_add_state_reference() call, to 2838 * set up prog_data.param[] for assign_curb_setup() and 2839 * setup_pull_constants(). 2840 */ 2841void 2842fs_visitor::setup_paramvalues_refs() 2843{ 2844 if (c->dispatch_width != 8) 2845 return; 2846 2847 /* Set up the pointers to ParamValues now that that array is finalized. 
*/ 2848 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2849 c->prog_data.param[i] = 2850 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2851 this->param_offset[i]; 2852 } 2853} 2854 2855void 2856fs_visitor::assign_curb_setup() 2857{ 2858 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2859 if (c->dispatch_width == 8) { 2860 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2861 } else { 2862 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 2863 } 2864 2865 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2866 foreach_iter(exec_list_iterator, iter, this->instructions) { 2867 fs_inst *inst = (fs_inst *)iter.get(); 2868 2869 for (unsigned int i = 0; i < 3; i++) { 2870 if (inst->src[i].file == UNIFORM) { 2871 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2872 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 2873 constant_nr / 8, 2874 constant_nr % 8); 2875 2876 inst->src[i].file = FIXED_HW_REG; 2877 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2878 } 2879 } 2880 } 2881} 2882 2883void 2884fs_visitor::calculate_urb_setup() 2885{ 2886 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2887 urb_setup[i] = -1; 2888 } 2889 2890 int urb_next = 0; 2891 /* Figure out where each of the incoming setup attributes lands. */ 2892 if (intel->gen >= 6) { 2893 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2894 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2895 urb_setup[i] = urb_next++; 2896 } 2897 } 2898 } else { 2899 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 2900 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2901 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2902 int fp_index; 2903 2904 if (i >= VERT_RESULT_VAR0) 2905 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2906 else if (i <= VERT_RESULT_TEX7) 2907 fp_index = i; 2908 else 2909 fp_index = -1; 2910 2911 if (fp_index >= 0) 2912 urb_setup[fp_index] = urb_next++; 2913 } 2914 } 2915 } 2916 2917 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2918 c->prog_data.urb_read_length = urb_next * 2; 2919} 2920 2921void 2922fs_visitor::assign_urb_setup() 2923{ 2924 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 2925 2926 /* Offset all the urb_setup[] index by the actual position of the 2927 * setup regs, now that the location of the constants has been chosen. 2928 */ 2929 foreach_iter(exec_list_iterator, iter, this->instructions) { 2930 fs_inst *inst = (fs_inst *)iter.get(); 2931 2932 if (inst->opcode == FS_OPCODE_LINTERP) { 2933 assert(inst->src[2].file == FIXED_HW_REG); 2934 inst->src[2].fixed_hw_reg.nr += urb_start; 2935 } 2936 2937 if (inst->opcode == FS_OPCODE_CINTERP) { 2938 assert(inst->src[0].file == FIXED_HW_REG); 2939 inst->src[0].fixed_hw_reg.nr += urb_start; 2940 } 2941 } 2942 2943 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2944} 2945 2946/** 2947 * Split large virtual GRFs into separate components if we can. 2948 * 2949 * This is mostly duplicated with what brw_fs_vector_splitting does, 2950 * but that's really conservative because it's afraid of doing 2951 * splitting that doesn't result in real progress after the rest of 2952 * the optimization phases, which would cause infinite looping in 2953 * optimization. We can do it once here, safely. This also has the 2954 * opportunity to split interpolated values, or maybe even uniforms, 2955 * which we don't have at the IR level. 
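 * (e.g. a virtual GRF that was allocated as a contiguous vec4 temporary
 * can usually be split into four independent single-register GRFs.)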
2956 * 2957 * We want to split, because virtual GRFs are what we register 2958 * allocate and spill (due to contiguousness requirements for some 2959 * instructions), and they're what we naturally generate in the 2960 * codegen process, but most virtual GRFs don't actually need to be 2961 * contiguous sets of GRFs. If we split, we'll end up with reduced 2962 * live intervals and better dead code elimination and coalescing. 2963 */ 2964void 2965fs_visitor::split_virtual_grfs() 2966{ 2967 int num_vars = this->virtual_grf_next; 2968 bool split_grf[num_vars]; 2969 int new_virtual_grf[num_vars]; 2970 2971 /* Try to split anything > 0 sized. */ 2972 for (int i = 0; i < num_vars; i++) { 2973 if (this->virtual_grf_sizes[i] != 1) 2974 split_grf[i] = true; 2975 else 2976 split_grf[i] = false; 2977 } 2978 2979 if (brw->has_pln) { 2980 /* PLN opcodes rely on the delta_xy being contiguous. */ 2981 split_grf[this->delta_x.reg] = false; 2982 } 2983 2984 foreach_iter(exec_list_iterator, iter, this->instructions) { 2985 fs_inst *inst = (fs_inst *)iter.get(); 2986 2987 /* Texturing produces 4 contiguous registers, so no splitting. */ 2988 if (inst->is_tex()) { 2989 split_grf[inst->dst.reg] = false; 2990 } 2991 } 2992 2993 /* Allocate new space for split regs. Note that the virtual 2994 * numbers will be contiguous. 2995 */ 2996 for (int i = 0; i < num_vars; i++) { 2997 if (split_grf[i]) { 2998 new_virtual_grf[i] = virtual_grf_alloc(1); 2999 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 3000 int reg = virtual_grf_alloc(1); 3001 assert(reg == new_virtual_grf[i] + j - 1); 3002 (void) reg; 3003 } 3004 this->virtual_grf_sizes[i] = 1; 3005 } 3006 } 3007 3008 foreach_iter(exec_list_iterator, iter, this->instructions) { 3009 fs_inst *inst = (fs_inst *)iter.get(); 3010 3011 if (inst->dst.file == GRF && 3012 split_grf[inst->dst.reg] && 3013 inst->dst.reg_offset != 0) { 3014 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 3015 inst->dst.reg_offset - 1); 3016 inst->dst.reg_offset = 0; 3017 } 3018 for (int i = 0; i < 3; i++) { 3019 if (inst->src[i].file == GRF && 3020 split_grf[inst->src[i].reg] && 3021 inst->src[i].reg_offset != 0) { 3022 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 3023 inst->src[i].reg_offset - 1); 3024 inst->src[i].reg_offset = 0; 3025 } 3026 } 3027 } 3028 this->live_intervals_valid = false; 3029} 3030 3031/** 3032 * Choose accesses from the UNIFORM file to demote to using the pull 3033 * constant buffer. 3034 * 3035 * We allow a fragment shader to have more than the specified minimum 3036 * maximum number of fragment shader uniform components (64). If 3037 * there are too many of these, they'd fill up all of register space. 3038 * So, this will push some of them out to the pull constant buffer and 3039 * update the program to load them. 3040 */ 3041void 3042fs_visitor::setup_pull_constants() 3043{ 3044 /* Only allow 16 registers (128 uniform components) as push constants. */ 3045 unsigned int max_uniform_components = 16 * 8; 3046 if (c->prog_data.nr_params <= max_uniform_components) 3047 return; 3048 3049 if (c->dispatch_width == 16) { 3050 fail("Pull constants not supported in 16-wide\n"); 3051 return; 3052 } 3053 3054 /* Just demote the end of the list. We could probably do better 3055 * here, demoting things that are rarely used in the program first. 
3056 */ 3057 int pull_uniform_base = max_uniform_components; 3058 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 3059 3060 foreach_iter(exec_list_iterator, iter, this->instructions) { 3061 fs_inst *inst = (fs_inst *)iter.get(); 3062 3063 for (int i = 0; i < 3; i++) { 3064 if (inst->src[i].file != UNIFORM) 3065 continue; 3066 3067 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 3068 if (uniform_nr < pull_uniform_base) 3069 continue; 3070 3071 fs_reg dst = fs_reg(this, glsl_type::float_type); 3072 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 3073 dst); 3074 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 3075 pull->ir = inst->ir; 3076 pull->annotation = inst->annotation; 3077 pull->base_mrf = 14; 3078 pull->mlen = 1; 3079 3080 inst->insert_before(pull); 3081 3082 inst->src[i].file = GRF; 3083 inst->src[i].reg = dst.reg; 3084 inst->src[i].reg_offset = 0; 3085 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 3086 } 3087 } 3088 3089 for (int i = 0; i < pull_uniform_count; i++) { 3090 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 3091 c->prog_data.pull_param_convert[i] = 3092 c->prog_data.param_convert[pull_uniform_base + i]; 3093 } 3094 c->prog_data.nr_params -= pull_uniform_count; 3095 c->prog_data.nr_pull_params = pull_uniform_count; 3096} 3097 3098void 3099fs_visitor::calculate_live_intervals() 3100{ 3101 int num_vars = this->virtual_grf_next; 3102 int *def = ralloc_array(mem_ctx, int, num_vars); 3103 int *use = ralloc_array(mem_ctx, int, num_vars); 3104 int loop_depth = 0; 3105 int loop_start = 0; 3106 3107 if (this->live_intervals_valid) 3108 return; 3109 3110 for (int i = 0; i < num_vars; i++) { 3111 def[i] = MAX_INSTRUCTION; 3112 use[i] = -1; 3113 } 3114 3115 int ip = 0; 3116 foreach_iter(exec_list_iterator, iter, this->instructions) { 3117 fs_inst *inst = (fs_inst *)iter.get(); 3118 3119 if (inst->opcode == BRW_OPCODE_DO) { 3120 if (loop_depth++ == 0) 3121 loop_start = ip; 3122 } else if (inst->opcode == BRW_OPCODE_WHILE) { 3123 loop_depth--; 3124 3125 if (loop_depth == 0) { 3126 /* Patches up the use of vars marked for being live across 3127 * the whole loop. 3128 */ 3129 for (int i = 0; i < num_vars; i++) { 3130 if (use[i] == loop_start) { 3131 use[i] = ip; 3132 } 3133 } 3134 } 3135 } else { 3136 for (unsigned int i = 0; i < 3; i++) { 3137 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 3138 int reg = inst->src[i].reg; 3139 3140 if (!loop_depth) { 3141 use[reg] = ip; 3142 } else { 3143 def[reg] = MIN2(loop_start, def[reg]); 3144 use[reg] = loop_start; 3145 3146 /* Nobody else is going to go smash our start to 3147 * later in the loop now, because def[reg] now 3148 * points before the bb header. 3149 */ 3150 } 3151 } 3152 } 3153 if (inst->dst.file == GRF && inst->dst.reg != 0) { 3154 int reg = inst->dst.reg; 3155 3156 if (!loop_depth) { 3157 def[reg] = MIN2(def[reg], ip); 3158 } else { 3159 def[reg] = MIN2(def[reg], loop_start); 3160 } 3161 } 3162 } 3163 3164 ip++; 3165 } 3166 3167 ralloc_free(this->virtual_grf_def); 3168 ralloc_free(this->virtual_grf_use); 3169 this->virtual_grf_def = def; 3170 this->virtual_grf_use = use; 3171 3172 this->live_intervals_valid = true; 3173} 3174 3175/** 3176 * Attempts to move immediate constants into the immediate 3177 * constant slot of following instructions. 
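 * e.g. "MOV tmp, 2.0F" followed by "MUL dst, src0, tmp" can become
 * "MUL dst, src0, 2.0F", commuting the operands when the immediate would
 * otherwise not land in the last slot.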
3178 * 3179 * Immediate constants are a bit tricky -- they have to be in the last 3180 * operand slot, you can't do abs/negate on them, 3181 */ 3182 3183bool 3184fs_visitor::propagate_constants() 3185{ 3186 bool progress = false; 3187 3188 calculate_live_intervals(); 3189 3190 foreach_iter(exec_list_iterator, iter, this->instructions) { 3191 fs_inst *inst = (fs_inst *)iter.get(); 3192 3193 if (inst->opcode != BRW_OPCODE_MOV || 3194 inst->predicated || 3195 inst->dst.file != GRF || inst->src[0].file != IMM || 3196 inst->dst.type != inst->src[0].type || 3197 (c->dispatch_width == 16 && 3198 (inst->force_uncompressed || inst->force_sechalf))) 3199 continue; 3200 3201 /* Don't bother with cases where we should have had the 3202 * operation on the constant folded in GLSL already. 3203 */ 3204 if (inst->saturate) 3205 continue; 3206 3207 /* Found a move of a constant to a GRF. Find anything else using the GRF 3208 * before it's written, and replace it with the constant if we can. 3209 */ 3210 exec_list_iterator scan_iter = iter; 3211 scan_iter.next(); 3212 for (; scan_iter.has_next(); scan_iter.next()) { 3213 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3214 3215 if (scan_inst->opcode == BRW_OPCODE_DO || 3216 scan_inst->opcode == BRW_OPCODE_WHILE || 3217 scan_inst->opcode == BRW_OPCODE_ELSE || 3218 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3219 break; 3220 } 3221 3222 for (int i = 2; i >= 0; i--) { 3223 if (scan_inst->src[i].file != GRF || 3224 scan_inst->src[i].reg != inst->dst.reg || 3225 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 3226 continue; 3227 3228 /* Don't bother with cases where we should have had the 3229 * operation on the constant folded in GLSL already. 3230 */ 3231 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 3232 continue; 3233 3234 switch (scan_inst->opcode) { 3235 case BRW_OPCODE_MOV: 3236 scan_inst->src[i] = inst->src[0]; 3237 progress = true; 3238 break; 3239 3240 case BRW_OPCODE_MUL: 3241 case BRW_OPCODE_ADD: 3242 if (i == 1) { 3243 scan_inst->src[i] = inst->src[0]; 3244 progress = true; 3245 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3246 /* Fit this constant in by commuting the operands */ 3247 scan_inst->src[0] = scan_inst->src[1]; 3248 scan_inst->src[1] = inst->src[0]; 3249 progress = true; 3250 } 3251 break; 3252 3253 case BRW_OPCODE_CMP: 3254 if (i == 1) { 3255 scan_inst->src[i] = inst->src[0]; 3256 progress = true; 3257 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3258 uint32_t new_cmod; 3259 3260 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 3261 if (new_cmod != ~0u) { 3262 /* Fit this constant in by swapping the operands and 3263 * flipping the test 3264 */ 3265 scan_inst->src[0] = scan_inst->src[1]; 3266 scan_inst->src[1] = inst->src[0]; 3267 scan_inst->conditional_mod = new_cmod; 3268 progress = true; 3269 } 3270 } 3271 break; 3272 3273 case BRW_OPCODE_SEL: 3274 if (i == 1) { 3275 scan_inst->src[i] = inst->src[0]; 3276 progress = true; 3277 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3278 /* Fit this constant in by swapping the operands and 3279 * flipping the predicate 3280 */ 3281 scan_inst->src[0] = scan_inst->src[1]; 3282 scan_inst->src[1] = inst->src[0]; 3283 scan_inst->predicate_inverse = !scan_inst->predicate_inverse; 3284 progress = true; 3285 } 3286 break; 3287 } 3288 } 3289 3290 if (scan_inst->dst.file == GRF && 3291 scan_inst->dst.reg == inst->dst.reg && 3292 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3293 scan_inst->is_tex())) { 3294 break; 3295 } 3296 } 3297 } 3298 3299 if 
(progress) 3300 this->live_intervals_valid = false; 3301 3302 return progress; 3303} 3304/** 3305 * Must be called after calculate_live_intervales() to remove unused 3306 * writes to registers -- register allocation will fail otherwise 3307 * because something deffed but not used won't be considered to 3308 * interfere with other regs. 3309 */ 3310bool 3311fs_visitor::dead_code_eliminate() 3312{ 3313 bool progress = false; 3314 int pc = 0; 3315 3316 calculate_live_intervals(); 3317 3318 foreach_iter(exec_list_iterator, iter, this->instructions) { 3319 fs_inst *inst = (fs_inst *)iter.get(); 3320 3321 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 3322 inst->remove(); 3323 progress = true; 3324 } 3325 3326 pc++; 3327 } 3328 3329 if (progress) 3330 live_intervals_valid = false; 3331 3332 return progress; 3333} 3334 3335bool 3336fs_visitor::register_coalesce() 3337{ 3338 bool progress = false; 3339 int if_depth = 0; 3340 int loop_depth = 0; 3341 3342 foreach_iter(exec_list_iterator, iter, this->instructions) { 3343 fs_inst *inst = (fs_inst *)iter.get(); 3344 3345 /* Make sure that we dominate the instructions we're going to 3346 * scan for interfering with our coalescing, or we won't have 3347 * scanned enough to see if anything interferes with our 3348 * coalescing. We don't dominate the following instructions if 3349 * we're in a loop or an if block. 3350 */ 3351 switch (inst->opcode) { 3352 case BRW_OPCODE_DO: 3353 loop_depth++; 3354 break; 3355 case BRW_OPCODE_WHILE: 3356 loop_depth--; 3357 break; 3358 case BRW_OPCODE_IF: 3359 if_depth++; 3360 break; 3361 case BRW_OPCODE_ENDIF: 3362 if_depth--; 3363 break; 3364 } 3365 if (loop_depth || if_depth) 3366 continue; 3367 3368 if (inst->opcode != BRW_OPCODE_MOV || 3369 inst->predicated || 3370 inst->saturate || 3371 inst->dst.file != GRF || inst->src[0].file != GRF || 3372 inst->dst.type != inst->src[0].type) 3373 continue; 3374 3375 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 3376 3377 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3378 * them: check for no writes to either one until the exit of the 3379 * program. 3380 */ 3381 bool interfered = false; 3382 exec_list_iterator scan_iter = iter; 3383 scan_iter.next(); 3384 for (; scan_iter.has_next(); scan_iter.next()) { 3385 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3386 3387 if (scan_inst->dst.file == GRF) { 3388 if (scan_inst->dst.reg == inst->dst.reg && 3389 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3390 scan_inst->is_tex())) { 3391 interfered = true; 3392 break; 3393 } 3394 if (scan_inst->dst.reg == inst->src[0].reg && 3395 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3396 scan_inst->is_tex())) { 3397 interfered = true; 3398 break; 3399 } 3400 } 3401 3402 /* The gen6 MATH instruction can't handle source modifiers, so avoid 3403 * coalescing those for now. We should do something more specific. 3404 */ 3405 if (intel->gen >= 6 && scan_inst->is_math() && has_source_modifiers) { 3406 interfered = true; 3407 break; 3408 } 3409 } 3410 if (interfered) { 3411 continue; 3412 } 3413 3414 /* Rewrite the later usage to point at the source of the move to 3415 * be removed. 
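 * e.g. after "MOV b, a", every later read of b becomes a read of a
 * (carrying over abs/negate/smear), and the MOV itself is then deleted.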
3416 */ 3417 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3418 scan_iter.next()) { 3419 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3420 3421 for (int i = 0; i < 3; i++) { 3422 if (scan_inst->src[i].file == GRF && 3423 scan_inst->src[i].reg == inst->dst.reg && 3424 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3425 scan_inst->src[i].reg = inst->src[0].reg; 3426 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3427 scan_inst->src[i].abs |= inst->src[0].abs; 3428 scan_inst->src[i].negate ^= inst->src[0].negate; 3429 scan_inst->src[i].smear = inst->src[0].smear; 3430 } 3431 } 3432 } 3433 3434 inst->remove(); 3435 progress = true; 3436 } 3437 3438 if (progress) 3439 live_intervals_valid = false; 3440 3441 return progress; 3442} 3443 3444 3445bool 3446fs_visitor::compute_to_mrf() 3447{ 3448 bool progress = false; 3449 int next_ip = 0; 3450 3451 calculate_live_intervals(); 3452 3453 foreach_iter(exec_list_iterator, iter, this->instructions) { 3454 fs_inst *inst = (fs_inst *)iter.get(); 3455 3456 int ip = next_ip; 3457 next_ip++; 3458 3459 if (inst->opcode != BRW_OPCODE_MOV || 3460 inst->predicated || 3461 inst->dst.file != MRF || inst->src[0].file != GRF || 3462 inst->dst.type != inst->src[0].type || 3463 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3464 continue; 3465 3466 /* Work out which hardware MRF registers are written by this 3467 * instruction. 3468 */ 3469 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4; 3470 int mrf_high; 3471 if (inst->dst.hw_reg & BRW_MRF_COMPR4) { 3472 mrf_high = mrf_low + 4; 3473 } else if (c->dispatch_width == 16 && 3474 (!inst->force_uncompressed && !inst->force_sechalf)) { 3475 mrf_high = mrf_low + 1; 3476 } else { 3477 mrf_high = mrf_low; 3478 } 3479 3480 /* Can't compute-to-MRF this GRF if someone else was going to 3481 * read it later. 3482 */ 3483 if (this->virtual_grf_use[inst->src[0].reg] > ip) 3484 continue; 3485 3486 /* Found a move of a GRF to a MRF. Let's see if we can go 3487 * rewrite the thing that made this GRF to write into the MRF. 3488 */ 3489 fs_inst *scan_inst; 3490 for (scan_inst = (fs_inst *)inst->prev; 3491 scan_inst->prev != NULL; 3492 scan_inst = (fs_inst *)scan_inst->prev) { 3493 if (scan_inst->dst.file == GRF && 3494 scan_inst->dst.reg == inst->src[0].reg) { 3495 /* Found the last thing to write our reg we want to turn 3496 * into a compute-to-MRF. 3497 */ 3498 3499 if (scan_inst->is_tex()) { 3500 /* texturing writes several continuous regs, so we can't 3501 * compute-to-mrf that. 3502 */ 3503 break; 3504 } 3505 3506 /* If it's predicated, it (probably) didn't populate all 3507 * the channels. We might be able to rewrite everything 3508 * that writes that reg, but it would require smarter 3509 * tracking to delay the rewriting until complete success. 3510 */ 3511 if (scan_inst->predicated) 3512 break; 3513 3514 /* If it's half of register setup and not the same half as 3515 * our MOV we're trying to remove, bail for now. 3516 */ 3517 if (scan_inst->force_uncompressed != inst->force_uncompressed || 3518 scan_inst->force_sechalf != inst->force_sechalf) { 3519 break; 3520 } 3521 3522 /* SEND instructions can't have MRF as a destination. */ 3523 if (scan_inst->mlen) 3524 break; 3525 3526 if (intel->gen >= 6) { 3527 /* gen6 math instructions must have the destination be 3528 * GRF, so no compute-to-MRF for them. 
3529 */ 3530 if (scan_inst->is_math()) { 3531 break; 3532 } 3533 } 3534 3535 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { 3536 /* Found the creator of our MRF's source value. */ 3537 scan_inst->dst.file = MRF; 3538 scan_inst->dst.hw_reg = inst->dst.hw_reg; 3539 scan_inst->saturate |= inst->saturate; 3540 inst->remove(); 3541 progress = true; 3542 } 3543 break; 3544 } 3545 3546 /* We don't handle flow control here. Most computation of 3547 * values that end up in MRFs are shortly before the MRF 3548 * write anyway. 3549 */ 3550 if (scan_inst->opcode == BRW_OPCODE_DO || 3551 scan_inst->opcode == BRW_OPCODE_WHILE || 3552 scan_inst->opcode == BRW_OPCODE_ELSE || 3553 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3554 break; 3555 } 3556 3557 /* You can't read from an MRF, so if someone else reads our 3558 * MRF's source GRF that we wanted to rewrite, that stops us. 3559 */ 3560 bool interfered = false; 3561 for (int i = 0; i < 3; i++) { 3562 if (scan_inst->src[i].file == GRF && 3563 scan_inst->src[i].reg == inst->src[0].reg && 3564 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { 3565 interfered = true; 3566 } 3567 } 3568 if (interfered) 3569 break; 3570 3571 if (scan_inst->dst.file == MRF) { 3572 /* If somebody else writes our MRF here, we can't 3573 * compute-to-MRF before that. 3574 */ 3575 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4; 3576 int scan_mrf_high; 3577 3578 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) { 3579 scan_mrf_high = scan_mrf_low + 4; 3580 } else if (c->dispatch_width == 16 && 3581 (!scan_inst->force_uncompressed && 3582 !scan_inst->force_sechalf)) { 3583 scan_mrf_high = scan_mrf_low + 1; 3584 } else { 3585 scan_mrf_high = scan_mrf_low; 3586 } 3587 3588 if (mrf_low == scan_mrf_low || 3589 mrf_low == scan_mrf_high || 3590 mrf_high == scan_mrf_low || 3591 mrf_high == scan_mrf_high) { 3592 break; 3593 } 3594 } 3595 3596 if (scan_inst->mlen > 0) { 3597 /* Found a SEND instruction, which means that there are 3598 * live values in MRFs from base_mrf to base_mrf + 3599 * scan_inst->mlen - 1. Don't go pushing our MRF write up 3600 * above it. 3601 */ 3602 if (mrf_low >= scan_inst->base_mrf && 3603 mrf_low < scan_inst->base_mrf + scan_inst->mlen) { 3604 break; 3605 } 3606 if (mrf_high >= scan_inst->base_mrf && 3607 mrf_high < scan_inst->base_mrf + scan_inst->mlen) { 3608 break; 3609 } 3610 } 3611 } 3612 } 3613 3614 return progress; 3615} 3616 3617/** 3618 * Walks through basic blocks, locking for repeated MRF writes and 3619 * removing the later ones. 3620 */ 3621bool 3622fs_visitor::remove_duplicate_mrf_writes() 3623{ 3624 fs_inst *last_mrf_move[16]; 3625 bool progress = false; 3626 3627 /* Need to update the MRF tracking for compressed instructions. 
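 * (a compressed instruction writes a pair of MRFs, which the single-slot
 * last_mrf_move[] tracking below doesn't model, so 16-wide is likely
 * skipped for that reason.)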
*/ 3628 if (c->dispatch_width == 16) 3629 return false; 3630 3631 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3632 3633 foreach_iter(exec_list_iterator, iter, this->instructions) { 3634 fs_inst *inst = (fs_inst *)iter.get(); 3635 3636 switch (inst->opcode) { 3637 case BRW_OPCODE_DO: 3638 case BRW_OPCODE_WHILE: 3639 case BRW_OPCODE_IF: 3640 case BRW_OPCODE_ELSE: 3641 case BRW_OPCODE_ENDIF: 3642 memset(last_mrf_move, 0, sizeof(last_mrf_move)); 3643 continue; 3644 default: 3645 break; 3646 } 3647 3648 if (inst->opcode == BRW_OPCODE_MOV && 3649 inst->dst.file == MRF) { 3650 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg]; 3651 if (prev_inst && inst->equals(prev_inst)) { 3652 inst->remove(); 3653 progress = true; 3654 continue; 3655 } 3656 } 3657 3658 /* Clear out the last-write records for MRFs that were overwritten. */ 3659 if (inst->dst.file == MRF) { 3660 last_mrf_move[inst->dst.hw_reg] = NULL; 3661 } 3662 3663 if (inst->mlen > 0) { 3664 /* Found a SEND instruction, which will include two or fewer 3665 * implied MRF writes. We could do better here. 3666 */ 3667 for (int i = 0; i < implied_mrf_writes(inst); i++) { 3668 last_mrf_move[inst->base_mrf + i] = NULL; 3669 } 3670 } 3671 3672 /* Clear out any MRF move records whose sources got overwritten. */ 3673 if (inst->dst.file == GRF) { 3674 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3675 if (last_mrf_move[i] && 3676 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3677 last_mrf_move[i] = NULL; 3678 } 3679 } 3680 } 3681 3682 if (inst->opcode == BRW_OPCODE_MOV && 3683 inst->dst.file == MRF && 3684 inst->src[0].file == GRF && 3685 !inst->predicated) { 3686 last_mrf_move[inst->dst.hw_reg] = inst; 3687 } 3688 } 3689 3690 return progress; 3691} 3692 3693bool 3694fs_visitor::virtual_grf_interferes(int a, int b) 3695{ 3696 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3697 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3698 3699 /* We can't handle dead register writes here, without iterating 3700 * over the whole instruction stream to find every single dead 3701 * write to that register to compare to the live interval of the 3702 * other register. Just assert that dead_code_eliminate() has been 3703 * called. 3704 */ 3705 assert((this->virtual_grf_use[a] != -1 || 3706 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3707 (this->virtual_grf_use[b] != -1 || 3708 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3709 3710 /* If the register is used to store 16 values of less than float 3711 * size (only the case for pixel_[xy]), then we can't allocate 3712 * another dword-sized thing to that register that would be used in 3713 * the same instruction. This is because when the GPU decodes (for 3714 * example): 3715 * 3716 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 3717 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 3718 * 3719 * it's actually processed as: 3720 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 3721 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 3722 * 3723 * so our second half values in g6 got overwritten in the first 3724 * half. 
3725 */ 3726 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 3727 this->pixel_x.reg == b || 3728 this->pixel_y.reg == a || 3729 this->pixel_y.reg == b)) { 3730 return start <= end; 3731 } 3732 3733 return start < end; 3734} 3735 3736static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3737{ 3738 struct brw_reg brw_reg; 3739 3740 switch (reg->file) { 3741 case GRF: 3742 case ARF: 3743 case MRF: 3744 if (reg->smear == -1) { 3745 brw_reg = brw_vec8_reg(reg->file, 3746 reg->hw_reg, 0); 3747 } else { 3748 brw_reg = brw_vec1_reg(reg->file, 3749 reg->hw_reg, reg->smear); 3750 } 3751 brw_reg = retype(brw_reg, reg->type); 3752 if (reg->sechalf) 3753 brw_reg = sechalf(brw_reg); 3754 break; 3755 case IMM: 3756 switch (reg->type) { 3757 case BRW_REGISTER_TYPE_F: 3758 brw_reg = brw_imm_f(reg->imm.f); 3759 break; 3760 case BRW_REGISTER_TYPE_D: 3761 brw_reg = brw_imm_d(reg->imm.i); 3762 break; 3763 case BRW_REGISTER_TYPE_UD: 3764 brw_reg = brw_imm_ud(reg->imm.u); 3765 break; 3766 default: 3767 assert(!"not reached"); 3768 brw_reg = brw_null_reg(); 3769 break; 3770 } 3771 break; 3772 case FIXED_HW_REG: 3773 brw_reg = reg->fixed_hw_reg; 3774 break; 3775 case BAD_FILE: 3776 /* Probably unused. */ 3777 brw_reg = brw_null_reg(); 3778 break; 3779 case UNIFORM: 3780 assert(!"not reached"); 3781 brw_reg = brw_null_reg(); 3782 break; 3783 default: 3784 assert(!"not reached"); 3785 brw_reg = brw_null_reg(); 3786 break; 3787 } 3788 if (reg->abs) 3789 brw_reg = brw_abs(brw_reg); 3790 if (reg->negate) 3791 brw_reg = negate(brw_reg); 3792 3793 return brw_reg; 3794} 3795 3796void 3797fs_visitor::generate_code() 3798{ 3799 int last_native_inst = p->nr_insn; 3800 const char *last_annotation_string = NULL; 3801 ir_instruction *last_annotation_ir = NULL; 3802 3803 int loop_stack_array_size = 16; 3804 int loop_stack_depth = 0; 3805 brw_instruction **loop_stack = 3806 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); 3807 int *if_depth_in_loop = 3808 rzalloc_array(this->mem_ctx, int, loop_stack_array_size); 3809 3810 3811 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3812 printf("Native code for fragment shader %d (%d-wide dispatch):\n", 3813 ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width); 3814 } 3815 3816 foreach_iter(exec_list_iterator, iter, this->instructions) { 3817 fs_inst *inst = (fs_inst *)iter.get(); 3818 struct brw_reg src[3], dst; 3819 3820 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3821 if (last_annotation_ir != inst->ir) { 3822 last_annotation_ir = inst->ir; 3823 if (last_annotation_ir) { 3824 printf(" "); 3825 last_annotation_ir->print(); 3826 printf("\n"); 3827 } 3828 } 3829 if (last_annotation_string != inst->annotation) { 3830 last_annotation_string = inst->annotation; 3831 if (last_annotation_string) 3832 printf(" %s\n", last_annotation_string); 3833 } 3834 } 3835 3836 for (unsigned int i = 0; i < 3; i++) { 3837 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3838 } 3839 dst = brw_reg_from_fs_reg(&inst->dst); 3840 3841 brw_set_conditionalmod(p, inst->conditional_mod); 3842 brw_set_predicate_control(p, inst->predicated); 3843 brw_set_predicate_inverse(p, inst->predicate_inverse); 3844 brw_set_saturate(p, inst->saturate); 3845 3846 if (inst->force_uncompressed || c->dispatch_width == 8) { 3847 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 3848 } else if (inst->force_sechalf) { 3849 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 3850 } else { 3851 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 3852 } 3853 3854 switch (inst->opcode) { 3855 case 
BRW_OPCODE_MOV: 3856 brw_MOV(p, dst, src[0]); 3857 break; 3858 case BRW_OPCODE_ADD: 3859 brw_ADD(p, dst, src[0], src[1]); 3860 break; 3861 case BRW_OPCODE_MUL: 3862 brw_MUL(p, dst, src[0], src[1]); 3863 break; 3864 3865 case BRW_OPCODE_FRC: 3866 brw_FRC(p, dst, src[0]); 3867 break; 3868 case BRW_OPCODE_RNDD: 3869 brw_RNDD(p, dst, src[0]); 3870 break; 3871 case BRW_OPCODE_RNDE: 3872 brw_RNDE(p, dst, src[0]); 3873 break; 3874 case BRW_OPCODE_RNDZ: 3875 brw_RNDZ(p, dst, src[0]); 3876 break; 3877 3878 case BRW_OPCODE_AND: 3879 brw_AND(p, dst, src[0], src[1]); 3880 break; 3881 case BRW_OPCODE_OR: 3882 brw_OR(p, dst, src[0], src[1]); 3883 break; 3884 case BRW_OPCODE_XOR: 3885 brw_XOR(p, dst, src[0], src[1]); 3886 break; 3887 case BRW_OPCODE_NOT: 3888 brw_NOT(p, dst, src[0]); 3889 break; 3890 case BRW_OPCODE_ASR: 3891 brw_ASR(p, dst, src[0], src[1]); 3892 break; 3893 case BRW_OPCODE_SHR: 3894 brw_SHR(p, dst, src[0], src[1]); 3895 break; 3896 case BRW_OPCODE_SHL: 3897 brw_SHL(p, dst, src[0], src[1]); 3898 break; 3899 3900 case BRW_OPCODE_CMP: 3901 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3902 break; 3903 case BRW_OPCODE_SEL: 3904 brw_SEL(p, dst, src[0], src[1]); 3905 break; 3906 3907 case BRW_OPCODE_IF: 3908 if (inst->src[0].file != BAD_FILE) { 3909 /* The instruction has an embedded compare (only allowed on gen6) */ 3910 assert(intel->gen == 6); 3911 gen6_IF(p, inst->conditional_mod, src[0], src[1]); 3912 } else { 3913 brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8); 3914 } 3915 if_depth_in_loop[loop_stack_depth]++; 3916 break; 3917 3918 case BRW_OPCODE_ELSE: 3919 brw_ELSE(p); 3920 break; 3921 case BRW_OPCODE_ENDIF: 3922 brw_ENDIF(p); 3923 if_depth_in_loop[loop_stack_depth]--; 3924 break; 3925 3926 case BRW_OPCODE_DO: 3927 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3928 if (loop_stack_array_size <= loop_stack_depth) { 3929 loop_stack_array_size *= 2; 3930 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, 3931 loop_stack_array_size); 3932 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, 3933 loop_stack_array_size); 3934 } 3935 if_depth_in_loop[loop_stack_depth] = 0; 3936 break; 3937 3938 case BRW_OPCODE_BREAK: 3939 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3940 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3941 break; 3942 case BRW_OPCODE_CONTINUE: 3943 /* FINISHME: We need to write the loop instruction support still. 
*/ 3944 if (intel->gen >= 6) 3945 gen6_CONT(p, loop_stack[loop_stack_depth - 1]); 3946 else 3947 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3948 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3949 break; 3950 3951 case BRW_OPCODE_WHILE: { 3952 struct brw_instruction *inst0, *inst1; 3953 GLuint br = 1; 3954 3955 if (intel->gen >= 5) 3956 br = 2; 3957 3958 assert(loop_stack_depth > 0); 3959 loop_stack_depth--; 3960 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3961 if (intel->gen < 6) { 3962 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3963 while (inst0 > loop_stack[loop_stack_depth]) { 3964 inst0--; 3965 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3966 inst0->bits3.if_else.jump_count == 0) { 3967 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3968 } 3969 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3970 inst0->bits3.if_else.jump_count == 0) { 3971 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3972 } 3973 } 3974 } 3975 } 3976 break; 3977 3978 case FS_OPCODE_RCP: 3979 case FS_OPCODE_RSQ: 3980 case FS_OPCODE_SQRT: 3981 case FS_OPCODE_EXP2: 3982 case FS_OPCODE_LOG2: 3983 case FS_OPCODE_POW: 3984 case FS_OPCODE_SIN: 3985 case FS_OPCODE_COS: 3986 generate_math(inst, dst, src); 3987 break; 3988 case FS_OPCODE_PIXEL_X: 3989 generate_pixel_xy(dst, true); 3990 break; 3991 case FS_OPCODE_PIXEL_Y: 3992 generate_pixel_xy(dst, false); 3993 break; 3994 case FS_OPCODE_CINTERP: 3995 brw_MOV(p, dst, src[0]); 3996 break; 3997 case FS_OPCODE_LINTERP: 3998 generate_linterp(inst, dst, src); 3999 break; 4000 case FS_OPCODE_TEX: 4001 case FS_OPCODE_TXB: 4002 case FS_OPCODE_TXD: 4003 case FS_OPCODE_TXL: 4004 generate_tex(inst, dst, src[0]); 4005 break; 4006 case FS_OPCODE_DISCARD: 4007 generate_discard(inst); 4008 break; 4009 case FS_OPCODE_DDX: 4010 generate_ddx(inst, dst, src[0]); 4011 break; 4012 case FS_OPCODE_DDY: 4013 generate_ddy(inst, dst, src[0]); 4014 break; 4015 4016 case FS_OPCODE_SPILL: 4017 generate_spill(inst, src[0]); 4018 break; 4019 4020 case FS_OPCODE_UNSPILL: 4021 generate_unspill(inst, dst); 4022 break; 4023 4024 case FS_OPCODE_PULL_CONSTANT_LOAD: 4025 generate_pull_constant_load(inst, dst); 4026 break; 4027 4028 case FS_OPCODE_FB_WRITE: 4029 generate_fb_write(inst); 4030 break; 4031 default: 4032 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) { 4033 _mesa_problem(ctx, "Unsupported opcode `%s' in FS", 4034 brw_opcodes[inst->opcode].name); 4035 } else { 4036 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode); 4037 } 4038 fail("unsupported opcode in FS\n"); 4039 } 4040 4041 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 4042 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) { 4043 if (0) { 4044 printf("0x%08x 0x%08x 0x%08x 0x%08x ", 4045 ((uint32_t *)&p->store[i])[3], 4046 ((uint32_t *)&p->store[i])[2], 4047 ((uint32_t *)&p->store[i])[1], 4048 ((uint32_t *)&p->store[i])[0]); 4049 } 4050 brw_disasm(stdout, &p->store[i], intel->gen); 4051 } 4052 } 4053 4054 last_native_inst = p->nr_insn; 4055 } 4056 4057 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 4058 printf("\n"); 4059 } 4060 4061 ralloc_free(loop_stack); 4062 ralloc_free(if_depth_in_loop); 4063 4064 brw_set_uip_jip(p); 4065 4066 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS 4067 * emit issues, it doesn't get the jump distances into the output, 4068 * which is often something we want to debug. So this is here in 4069 * case you're doing that. 
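 * Flipping the "if (0)" below (still gated on INTEL_DEBUG=wm)
 * re-disassembles the whole program after brw_set_uip_jip() has filled in
 * the jump targets.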
bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      /* align to 64 byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         base_ir = ir;
         ir->accept(this);
      }

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      bool progress;
      do {
         progress = false;

         progress = remove_duplicate_mrf_writes() || progress;

         progress = propagate_constants() || progress;
         progress = register_coalesce() || progress;
         progress = compute_to_mrf() || progress;
         progress = dead_code_eliminate() || progress;
      } while (progress);

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         int virtual_grf_count = virtual_grf_next;
         for (int i = 1; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   if (c->dispatch_width == 8) {
      c->prog_data.total_grf = grf_used;
   } else {
      c->prog_data.total_grf_16 = grf_used;
      c->prog_data.prog_offset_16 = prog_offset_16;

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
   }

   return !failed;
}
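
/* Toplevel entry point for compiling a GLSL fragment shader to native code:
 * an 8-wide program is always built, and on gen5+ a 16-wide program is built
 * as well when the shader uses no pull constants.  Returns false when there
 * is no linked GLSL fragment shader to compile.
 */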
bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;

   if (!prog)
      return false;

   struct brw_shader *shader =
      (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (!shader)
      return false;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n\n");
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   c->dispatch_width = 8;

   fs_visitor v(c, shader);
   if (!v.run()) {
      /* FINISHME: Cleanly fail, test at link time, etc. */
      assert(!"not reached");
      return false;
   }

   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      c->dispatch_width = 16;
      fs_visitor v2(c, shader);
      v2.import_uniforms(v.variable_ht);
      v2.run();
   }

   c->prog_data.dispatch_width = 8;

   return true;
}