brw_fs.cpp revision ff6e3c73f6553cd29b915497b5b00e3ef158a27d
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28extern "C" { 29 30#include <sys/types.h> 31 32#include "main/macros.h" 33#include "main/shaderobj.h" 34#include "main/uniforms.h" 35#include "program/prog_parameter.h" 36#include "program/prog_print.h" 37#include "program/prog_optimize.h" 38#include "program/register_allocate.h" 39#include "program/sampler.h" 40#include "program/hash_table.h" 41#include "brw_context.h" 42#include "brw_eu.h" 43#include "brw_wm.h" 44} 45#include "brw_fs.h" 46#include "../glsl/glsl_types.h" 47#include "../glsl/ir_optimization.h" 48#include "../glsl/ir_print_visitor.h" 49 50#define MAX_INSTRUCTION (1 << 30) 51static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); 52 53struct gl_shader * 54brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) 55{ 56 struct brw_shader *shader; 57 58 shader = rzalloc(NULL, struct brw_shader); 59 if (shader) { 60 shader->base.Type = type; 61 shader->base.Name = name; 62 _mesa_init_shader(ctx, &shader->base); 63 } 64 65 return &shader->base; 66} 67 68struct gl_shader_program * 69brw_new_shader_program(struct gl_context *ctx, GLuint name) 70{ 71 struct brw_shader_program *prog; 72 prog = rzalloc(NULL, struct brw_shader_program); 73 if (prog) { 74 prog->base.Name = name; 75 _mesa_init_shader_program(ctx, &prog->base); 76 } 77 return &prog->base; 78} 79 80GLboolean 81brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) 82{ 83 struct brw_context *brw = brw_context(ctx); 84 struct intel_context *intel = &brw->intel; 85 86 struct brw_shader *shader = 87 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; 88 if (shader != NULL) { 89 void *mem_ctx = ralloc_context(NULL); 90 bool progress; 91 92 if (shader->ir) 93 ralloc_free(shader->ir); 94 shader->ir = new(shader) exec_list; 95 clone_ir_list(mem_ctx, shader->ir, shader->base.ir); 96 97 do_mat_op_to_vec(shader->ir); 98 lower_instructions(shader->ir, 99 MOD_TO_FRACT | 100 DIV_TO_MUL_RCP | 101 SUB_TO_ADD_NEG | 102 EXP_TO_EXP2 | 103 LOG_TO_LOG2); 104 105 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, 106 * if-statements need to be flattened. 
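 * lower_if_to_cond_assign(ir, 16) leaves shallower control flow alone and only
 * rewrites if-statements nested deeper than that limit.  As a rough sketch of
 * what the lowering produces, an innermost
 *   if (c) x = a; else x = b;
 * becomes unconditional evaluation plus conditional writes, along the lines of
 *   x = c ? a : x;  x = !c ? b : x;
 * so no extra branch nesting is needed at execution time.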
107 */ 108 if (intel->gen < 6) 109 lower_if_to_cond_assign(shader->ir, 16); 110 111 do_lower_texture_projection(shader->ir); 112 do_vec_index_to_cond_assign(shader->ir); 113 brw_do_cubemap_normalize(shader->ir); 114 lower_noise(shader->ir); 115 lower_quadop_vector(shader->ir, false); 116 lower_variable_index_to_cond_assign(shader->ir, 117 GL_TRUE, /* input */ 118 GL_TRUE, /* output */ 119 GL_TRUE, /* temp */ 120 GL_TRUE /* uniform */ 121 ); 122 123 do { 124 progress = false; 125 126 brw_do_channel_expressions(shader->ir); 127 brw_do_vector_splitting(shader->ir); 128 129 progress = do_lower_jumps(shader->ir, true, true, 130 true, /* main return */ 131 false, /* continue */ 132 false /* loops */ 133 ) || progress; 134 135 progress = do_common_optimization(shader->ir, true, 32) || progress; 136 } while (progress); 137 138 validate_ir_tree(shader->ir); 139 140 reparent_ir(shader->ir, shader->ir); 141 ralloc_free(mem_ctx); 142 } 143 144 if (!_mesa_ir_link_shader(ctx, prog)) 145 return GL_FALSE; 146 147 return GL_TRUE; 148} 149 150static int 151type_size(const struct glsl_type *type) 152{ 153 unsigned int size, i; 154 155 switch (type->base_type) { 156 case GLSL_TYPE_UINT: 157 case GLSL_TYPE_INT: 158 case GLSL_TYPE_FLOAT: 159 case GLSL_TYPE_BOOL: 160 return type->components(); 161 case GLSL_TYPE_ARRAY: 162 return type_size(type->fields.array) * type->length; 163 case GLSL_TYPE_STRUCT: 164 size = 0; 165 for (i = 0; i < type->length; i++) { 166 size += type_size(type->fields.structure[i].type); 167 } 168 return size; 169 case GLSL_TYPE_SAMPLER: 170 /* Samplers take up no register space, since they're baked in at 171 * link time. 172 */ 173 return 0; 174 default: 175 assert(!"not reached"); 176 return 0; 177 } 178} 179 180void 181fs_visitor::fail(const char *format, ...) 182{ 183 if (!failed) { 184 failed = true; 185 186 if (INTEL_DEBUG & DEBUG_WM) { 187 fprintf(stderr, "FS compile failed: "); 188 189 va_list va; 190 va_start(va, format); 191 vfprintf(stderr, format, va); 192 va_end(va); 193 } 194 } 195} 196 197void 198fs_visitor::push_force_uncompressed() 199{ 200 force_uncompressed_stack++; 201} 202 203void 204fs_visitor::pop_force_uncompressed() 205{ 206 force_uncompressed_stack--; 207 assert(force_uncompressed_stack >= 0); 208} 209 210void 211fs_visitor::push_force_sechalf() 212{ 213 force_sechalf_stack++; 214} 215 216void 217fs_visitor::pop_force_sechalf() 218{ 219 force_sechalf_stack--; 220 assert(force_sechalf_stack >= 0); 221} 222 223/** 224 * Returns how many MRFs an FS opcode will write over. 225 * 226 * Note that this is not the 0 or 1 implied writes in an actual gen 227 * instruction -- the FS opcodes often generate MOVs in addition. 
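 * The counts mirror the message payload each virtual opcode sets up when it is
 * emitted: e.g. the math opcodes report c->dispatch_width / 8, matching the
 * mlen that emit_math() uses pre-gen6, and FS_OPCODE_POW reports twice that
 * because its second operand occupies another message register.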
228 */ 229int 230fs_visitor::implied_mrf_writes(fs_inst *inst) 231{ 232 if (inst->mlen == 0) 233 return 0; 234 235 switch (inst->opcode) { 236 case FS_OPCODE_RCP: 237 case FS_OPCODE_RSQ: 238 case FS_OPCODE_SQRT: 239 case FS_OPCODE_EXP2: 240 case FS_OPCODE_LOG2: 241 case FS_OPCODE_SIN: 242 case FS_OPCODE_COS: 243 return 1 * c->dispatch_width / 8; 244 case FS_OPCODE_POW: 245 return 2 * c->dispatch_width / 8; 246 case FS_OPCODE_TEX: 247 case FS_OPCODE_TXB: 248 case FS_OPCODE_TXD: 249 case FS_OPCODE_TXL: 250 return 1; 251 case FS_OPCODE_FB_WRITE: 252 return 2; 253 case FS_OPCODE_PULL_CONSTANT_LOAD: 254 case FS_OPCODE_UNSPILL: 255 return 1; 256 case FS_OPCODE_SPILL: 257 return 2; 258 default: 259 assert(!"not reached"); 260 return inst->mlen; 261 } 262} 263 264int 265fs_visitor::virtual_grf_alloc(int size) 266{ 267 if (virtual_grf_array_size <= virtual_grf_next) { 268 if (virtual_grf_array_size == 0) 269 virtual_grf_array_size = 16; 270 else 271 virtual_grf_array_size *= 2; 272 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, 273 virtual_grf_array_size); 274 275 /* This slot is always unused. */ 276 virtual_grf_sizes[0] = 0; 277 } 278 virtual_grf_sizes[virtual_grf_next] = size; 279 return virtual_grf_next++; 280} 281 282/** Fixed HW reg constructor. */ 283fs_reg::fs_reg(enum register_file file, int hw_reg) 284{ 285 init(); 286 this->file = file; 287 this->hw_reg = hw_reg; 288 this->type = BRW_REGISTER_TYPE_F; 289} 290 291/** Fixed HW reg constructor. */ 292fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type) 293{ 294 init(); 295 this->file = file; 296 this->hw_reg = hw_reg; 297 this->type = type; 298} 299 300int 301brw_type_for_base_type(const struct glsl_type *type) 302{ 303 switch (type->base_type) { 304 case GLSL_TYPE_FLOAT: 305 return BRW_REGISTER_TYPE_F; 306 case GLSL_TYPE_INT: 307 case GLSL_TYPE_BOOL: 308 return BRW_REGISTER_TYPE_D; 309 case GLSL_TYPE_UINT: 310 return BRW_REGISTER_TYPE_UD; 311 case GLSL_TYPE_ARRAY: 312 case GLSL_TYPE_STRUCT: 313 case GLSL_TYPE_SAMPLER: 314 /* These should be overridden with the type of the member when 315 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely 316 * way to trip up if we don't. 317 */ 318 return BRW_REGISTER_TYPE_UD; 319 default: 320 assert(!"not reached"); 321 return BRW_REGISTER_TYPE_F; 322 } 323} 324 325/** Automatic reg constructor. */ 326fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type) 327{ 328 init(); 329 330 this->file = GRF; 331 this->reg = v->virtual_grf_alloc(type_size(type)); 332 this->reg_offset = 0; 333 this->type = brw_type_for_base_type(type); 334} 335 336fs_reg * 337fs_visitor::variable_storage(ir_variable *var) 338{ 339 return (fs_reg *)hash_table_find(this->variable_ht, var); 340} 341 342void 343import_uniforms_callback(const void *key, 344 void *data, 345 void *closure) 346{ 347 struct hash_table *dst_ht = (struct hash_table *)closure; 348 const fs_reg *reg = (const fs_reg *)data; 349 350 if (reg->file != UNIFORM) 351 return; 352 353 hash_table_insert(dst_ht, data, key); 354} 355 356/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch. 
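 * The SIMD16 compile runs after the SIMD8 compile and, rather than allocating
 * params a second time, copies every UNIFORM-file fs_reg out of the SIMD8
 * visitor's variable hash table into its own (see import_uniforms_callback
 * above and the dispatch_width == 16 early-out in visit(ir_variable)).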
357 * This brings in those uniform definitions 358 */ 359void 360fs_visitor::import_uniforms(struct hash_table *src_variable_ht) 361{ 362 hash_table_call_foreach(src_variable_ht, 363 import_uniforms_callback, 364 variable_ht); 365} 366 367/* Our support for uniforms is piggy-backed on the struct 368 * gl_fragment_program, because that's where the values actually 369 * get stored, rather than in some global gl_shader_program uniform 370 * store. 371 */ 372int 373fs_visitor::setup_uniform_values(int loc, const glsl_type *type) 374{ 375 unsigned int offset = 0; 376 377 if (type->is_matrix()) { 378 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, 379 type->vector_elements, 380 1); 381 382 for (unsigned int i = 0; i < type->matrix_columns; i++) { 383 offset += setup_uniform_values(loc + offset, column); 384 } 385 386 return offset; 387 } 388 389 switch (type->base_type) { 390 case GLSL_TYPE_FLOAT: 391 case GLSL_TYPE_UINT: 392 case GLSL_TYPE_INT: 393 case GLSL_TYPE_BOOL: 394 for (unsigned int i = 0; i < type->vector_elements; i++) { 395 unsigned int param = c->prog_data.nr_params++; 396 397 assert(param < ARRAY_SIZE(c->prog_data.param)); 398 399 switch (type->base_type) { 400 case GLSL_TYPE_FLOAT: 401 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 402 break; 403 case GLSL_TYPE_UINT: 404 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U; 405 break; 406 case GLSL_TYPE_INT: 407 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I; 408 break; 409 case GLSL_TYPE_BOOL: 410 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B; 411 break; 412 default: 413 assert(!"not reached"); 414 c->prog_data.param_convert[param] = PARAM_NO_CONVERT; 415 break; 416 } 417 this->param_index[param] = loc; 418 this->param_offset[param] = i; 419 } 420 return 1; 421 422 case GLSL_TYPE_STRUCT: 423 for (unsigned int i = 0; i < type->length; i++) { 424 offset += setup_uniform_values(loc + offset, 425 type->fields.structure[i].type); 426 } 427 return offset; 428 429 case GLSL_TYPE_ARRAY: 430 for (unsigned int i = 0; i < type->length; i++) { 431 offset += setup_uniform_values(loc + offset, type->fields.array); 432 } 433 return offset; 434 435 case GLSL_TYPE_SAMPLER: 436 /* The sampler takes up a slot, but we don't use any values from it. */ 437 return 1; 438 439 default: 440 assert(!"not reached"); 441 return 0; 442 } 443} 444 445 446/* Our support for builtin uniforms is even scarier than non-builtin. 447 * It sits on top of the PROG_STATE_VAR parameters that are 448 * automatically updated from GL context state. 449 */ 450void 451fs_visitor::setup_builtin_uniform_values(ir_variable *ir) 452{ 453 const ir_state_slot *const slots = ir->state_slots; 454 assert(ir->state_slots != NULL); 455 456 for (unsigned int i = 0; i < ir->num_state_slots; i++) { 457 /* This state reference has already been setup by ir_to_mesa, but we'll 458 * get the same index back here. 459 */ 460 int index = _mesa_add_state_reference(this->fp->Base.Parameters, 461 (gl_state_index *)slots[i].tokens); 462 463 /* Add each of the unique swizzles of the element as a parameter. 464 * This'll end up matching the expected layout of the 465 * array/matrix/structure we're trying to fill in. 
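 * The loop below walks the four swizzle components and stops at the first
 * repeated component, so a state slot swizzled .xyzw adds four params while a
 * scalar value splatted as .xxxx adds only one.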
466 */ 467 int last_swiz = -1; 468 for (unsigned int j = 0; j < 4; j++) { 469 int swiz = GET_SWZ(slots[i].swizzle, j); 470 if (swiz == last_swiz) 471 break; 472 last_swiz = swiz; 473 474 c->prog_data.param_convert[c->prog_data.nr_params] = 475 PARAM_NO_CONVERT; 476 this->param_index[c->prog_data.nr_params] = index; 477 this->param_offset[c->prog_data.nr_params] = swiz; 478 c->prog_data.nr_params++; 479 } 480 } 481} 482 483fs_reg * 484fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) 485{ 486 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 487 fs_reg wpos = *reg; 488 fs_reg neg_y = this->pixel_y; 489 neg_y.negate = true; 490 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo; 491 492 /* gl_FragCoord.x */ 493 if (ir->pixel_center_integer) { 494 emit(BRW_OPCODE_MOV, wpos, this->pixel_x); 495 } else { 496 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)); 497 } 498 wpos.reg_offset++; 499 500 /* gl_FragCoord.y */ 501 if (!flip && ir->pixel_center_integer) { 502 emit(BRW_OPCODE_MOV, wpos, this->pixel_y); 503 } else { 504 fs_reg pixel_y = this->pixel_y; 505 float offset = (ir->pixel_center_integer ? 0.0 : 0.5); 506 507 if (flip) { 508 pixel_y.negate = true; 509 offset += c->key.drawable_height - 1.0; 510 } 511 512 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)); 513 } 514 wpos.reg_offset++; 515 516 /* gl_FragCoord.z */ 517 if (intel->gen >= 6) { 518 emit(BRW_OPCODE_MOV, wpos, 519 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 520 } else { 521 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, 522 interp_reg(FRAG_ATTRIB_WPOS, 2)); 523 } 524 wpos.reg_offset++; 525 526 /* gl_FragCoord.w: Already set up in emit_interpolation */ 527 emit(BRW_OPCODE_MOV, wpos, this->wpos_w); 528 529 return reg; 530} 531 532fs_reg * 533fs_visitor::emit_general_interpolation(ir_variable *ir) 534{ 535 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 536 /* Interpolation is always in floating point regs. */ 537 reg->type = BRW_REGISTER_TYPE_F; 538 fs_reg attr = *reg; 539 540 unsigned int array_elements; 541 const glsl_type *type; 542 543 if (ir->type->is_array()) { 544 array_elements = ir->type->length; 545 if (array_elements == 0) { 546 fail("dereferenced array '%s' has length 0\n", ir->name); 547 } 548 type = ir->type->fields.array; 549 } else { 550 array_elements = 1; 551 type = ir->type; 552 } 553 554 int location = ir->location; 555 for (unsigned int i = 0; i < array_elements; i++) { 556 for (unsigned int j = 0; j < type->matrix_columns; j++) { 557 if (urb_setup[location] == -1) { 558 /* If there's no incoming setup data for this slot, don't 559 * emit interpolation for it. 560 */ 561 attr.reg_offset += type->vector_elements; 562 location++; 563 continue; 564 } 565 566 bool is_gl_Color = 567 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1; 568 569 if (c->key.flat_shade && is_gl_Color) { 570 /* Constant interpolation (flat shading) case. The SF has 571 * handed us defined values in only the constant offset 572 * field of the setup reg. 573 */ 574 for (unsigned int k = 0; k < type->vector_elements; k++) { 575 struct brw_reg interp = interp_reg(location, k); 576 interp = suboffset(interp, 3); 577 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); 578 attr.reg_offset++; 579 } 580 } else { 581 /* Perspective interpolation case. 
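 * FS_OPCODE_LINTERP evaluates the attribute's plane equation at this pixel's
 * delta_x/delta_y.  Pre-gen6 the plane coefficients are set up with the
 * per-vertex attribute already divided by w, so unless this is gl_Color being
 * linearly interpolated (c->key.linear_color) the result is multiplied by
 * pixel_w below to restore the perspective-correct value.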
*/ 582 for (unsigned int k = 0; k < type->vector_elements; k++) { 583 struct brw_reg interp = interp_reg(location, k); 584 emit(FS_OPCODE_LINTERP, attr, 585 this->delta_x, this->delta_y, fs_reg(interp)); 586 attr.reg_offset++; 587 } 588 589 if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) { 590 attr.reg_offset -= type->vector_elements; 591 for (unsigned int k = 0; k < type->vector_elements; k++) { 592 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w); 593 attr.reg_offset++; 594 } 595 } 596 } 597 location++; 598 } 599 } 600 601 return reg; 602} 603 604fs_reg * 605fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) 606{ 607 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); 608 609 /* The frontfacing comes in as a bit in the thread payload. */ 610 if (intel->gen >= 6) { 611 emit(BRW_OPCODE_ASR, *reg, 612 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), 613 fs_reg(15)); 614 emit(BRW_OPCODE_NOT, *reg, *reg); 615 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1)); 616 } else { 617 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); 618 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives 619 * us front face 620 */ 621 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg, 622 fs_reg(r1_6ud), 623 fs_reg(1u << 31)); 624 inst->conditional_mod = BRW_CONDITIONAL_L; 625 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)); 626 } 627 628 return reg; 629} 630 631fs_inst * 632fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src) 633{ 634 switch (opcode) { 635 case FS_OPCODE_RCP: 636 case FS_OPCODE_RSQ: 637 case FS_OPCODE_SQRT: 638 case FS_OPCODE_EXP2: 639 case FS_OPCODE_LOG2: 640 case FS_OPCODE_SIN: 641 case FS_OPCODE_COS: 642 break; 643 default: 644 assert(!"not reached: bad math opcode"); 645 return NULL; 646 } 647 648 /* Can't do hstride == 0 args to gen6 math, so expand it out. We 649 * might be able to do better by doing execsize = 1 math and then 650 * expanding that result out, but we would need to be careful with 651 * masking. 652 * 653 * The hardware ignores source modifiers (negate and abs) on math 654 * instructions, so we also move to a temp to set those up. 655 */ 656 if (intel->gen >= 6 && (src.file == UNIFORM || 657 src.abs || 658 src.negate)) { 659 fs_reg expanded = fs_reg(this, glsl_type::float_type); 660 emit(BRW_OPCODE_MOV, expanded, src); 661 src = expanded; 662 } 663 664 fs_inst *inst = emit(opcode, dst, src); 665 666 if (intel->gen < 6) { 667 inst->base_mrf = 2; 668 inst->mlen = c->dispatch_width / 8; 669 } 670 671 return inst; 672} 673 674fs_inst * 675fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) 676{ 677 int base_mrf = 2; 678 fs_inst *inst; 679 680 assert(opcode == FS_OPCODE_POW); 681 682 if (intel->gen >= 6) { 683 /* Can't do hstride == 0 args to gen6 math, so expand it out. 684 * 685 * The hardware ignores source modifiers (negate and abs) on math 686 * instructions, so we also move to a temp to set those up. 
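 * Concretely, a UNIFORM operand (a scalar, hstride 0, source) or one with
 * abs/negate set is first copied into a fresh GRF with a plain MOV, and the
 * math instruction then reads the temporary instead.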
687 */ 688 if (src0.file == UNIFORM || src0.abs || src0.negate) { 689 fs_reg expanded = fs_reg(this, glsl_type::float_type); 690 emit(BRW_OPCODE_MOV, expanded, src0); 691 src0 = expanded; 692 } 693 694 if (src1.file == UNIFORM || src1.abs || src1.negate) { 695 fs_reg expanded = fs_reg(this, glsl_type::float_type); 696 emit(BRW_OPCODE_MOV, expanded, src1); 697 src1 = expanded; 698 } 699 700 inst = emit(opcode, dst, src0, src1); 701 } else { 702 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1); 703 inst = emit(opcode, dst, src0, reg_null_f); 704 705 inst->base_mrf = base_mrf; 706 inst->mlen = 2 * c->dispatch_width / 8; 707 } 708 return inst; 709} 710 711void 712fs_visitor::visit(ir_variable *ir) 713{ 714 fs_reg *reg = NULL; 715 716 if (variable_storage(ir)) 717 return; 718 719 if (strcmp(ir->name, "gl_FragColor") == 0) { 720 this->frag_color = ir; 721 } else if (strcmp(ir->name, "gl_FragData") == 0) { 722 this->frag_data = ir; 723 } else if (strcmp(ir->name, "gl_FragDepth") == 0) { 724 this->frag_depth = ir; 725 } 726 727 if (ir->mode == ir_var_in) { 728 if (!strcmp(ir->name, "gl_FragCoord")) { 729 reg = emit_fragcoord_interpolation(ir); 730 } else if (!strcmp(ir->name, "gl_FrontFacing")) { 731 reg = emit_frontfacing_interpolation(ir); 732 } else { 733 reg = emit_general_interpolation(ir); 734 } 735 assert(reg); 736 hash_table_insert(this->variable_ht, reg, ir); 737 return; 738 } 739 740 if (ir->mode == ir_var_uniform) { 741 int param_index = c->prog_data.nr_params; 742 743 if (c->dispatch_width == 16) { 744 if (!variable_storage(ir)) { 745 fail("Failed to find uniform '%s' in 16-wide\n", ir->name); 746 } 747 return; 748 } 749 750 if (!strncmp(ir->name, "gl_", 3)) { 751 setup_builtin_uniform_values(ir); 752 } else { 753 setup_uniform_values(ir->location, ir->type); 754 } 755 756 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); 757 reg->type = brw_type_for_base_type(ir->type); 758 } 759 760 if (!reg) 761 reg = new(this->mem_ctx) fs_reg(this, ir->type); 762 763 hash_table_insert(this->variable_ht, reg, ir); 764} 765 766void 767fs_visitor::visit(ir_dereference_variable *ir) 768{ 769 fs_reg *reg = variable_storage(ir->var); 770 this->result = *reg; 771} 772 773void 774fs_visitor::visit(ir_dereference_record *ir) 775{ 776 const glsl_type *struct_type = ir->record->type; 777 778 ir->record->accept(this); 779 780 unsigned int offset = 0; 781 for (unsigned int i = 0; i < struct_type->length; i++) { 782 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) 783 break; 784 offset += type_size(struct_type->fields.structure[i].type); 785 } 786 this->result.reg_offset += offset; 787 this->result.type = brw_type_for_base_type(ir->type); 788} 789 790void 791fs_visitor::visit(ir_dereference_array *ir) 792{ 793 ir_constant *index; 794 int element_size; 795 796 ir->array->accept(this); 797 index = ir->array_index->as_constant(); 798 799 element_size = type_size(ir->type); 800 this->result.type = brw_type_for_base_type(ir->type); 801 802 if (index) { 803 assert(this->result.file == UNIFORM || 804 (this->result.file == GRF && 805 this->result.reg != 0)); 806 this->result.reg_offset += index->value.i[0] * element_size; 807 } else { 808 assert(!"FINISHME: non-constant array element"); 809 } 810} 811 812/* Instruction selection: Produce a MOV.sat instead of 813 * MIN(MAX(val, 0), 1) when possible. 
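 * ir->as_rvalue_to_saturate() hands back the inner rvalue when the expression
 * is just a clamp of that value to [0.0, 1.0]; in that case we evaluate the
 * inner value and emit a single MOV with inst->saturate set, letting the
 * destination saturate modifier do the clamping.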
814 */ 815bool 816fs_visitor::try_emit_saturate(ir_expression *ir) 817{ 818 ir_rvalue *sat_val = ir->as_rvalue_to_saturate(); 819 820 if (!sat_val) 821 return false; 822 823 sat_val->accept(this); 824 fs_reg src = this->result; 825 826 this->result = fs_reg(this, ir->type); 827 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src); 828 inst->saturate = true; 829 830 return true; 831} 832 833static uint32_t 834brw_conditional_for_comparison(unsigned int op) 835{ 836 switch (op) { 837 case ir_binop_less: 838 return BRW_CONDITIONAL_L; 839 case ir_binop_greater: 840 return BRW_CONDITIONAL_G; 841 case ir_binop_lequal: 842 return BRW_CONDITIONAL_LE; 843 case ir_binop_gequal: 844 return BRW_CONDITIONAL_GE; 845 case ir_binop_equal: 846 case ir_binop_all_equal: /* same as equal for scalars */ 847 return BRW_CONDITIONAL_Z; 848 case ir_binop_nequal: 849 case ir_binop_any_nequal: /* same as nequal for scalars */ 850 return BRW_CONDITIONAL_NZ; 851 default: 852 assert(!"not reached: bad operation for comparison"); 853 return BRW_CONDITIONAL_NZ; 854 } 855} 856 857void 858fs_visitor::visit(ir_expression *ir) 859{ 860 unsigned int operand; 861 fs_reg op[2], temp; 862 fs_inst *inst; 863 864 assert(ir->get_num_operands() <= 2); 865 866 if (try_emit_saturate(ir)) 867 return; 868 869 for (operand = 0; operand < ir->get_num_operands(); operand++) { 870 ir->operands[operand]->accept(this); 871 if (this->result.file == BAD_FILE) { 872 ir_print_visitor v; 873 fail("Failed to get tree for expression operand:\n"); 874 ir->operands[operand]->accept(&v); 875 } 876 op[operand] = this->result; 877 878 /* Matrix expression operands should have been broken down to vector 879 * operations already. 880 */ 881 assert(!ir->operands[operand]->type->is_matrix()); 882 /* And then those vector operands should have been broken down to scalar. 883 */ 884 assert(!ir->operands[operand]->type->is_vector()); 885 } 886 887 /* Storage for our result. If our result goes into an assignment, it will 888 * just get copy-propagated out, so no worries. 889 */ 890 this->result = fs_reg(this, ir->type); 891 892 switch (ir->operation) { 893 case ir_unop_logic_not: 894 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is 895 * ones complement of the whole register, not just bit 0. 
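 * Booleans live in a full register as 0 or 1 here, so XOR with 1 flips only
 * the low bit (0 ^ 1 == 1, 1 ^ 1 == 0), whereas NOT(0) would yield 0xffffffff
 * instead of 1.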
896 */ 897 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)); 898 break; 899 case ir_unop_neg: 900 op[0].negate = !op[0].negate; 901 this->result = op[0]; 902 break; 903 case ir_unop_abs: 904 op[0].abs = true; 905 op[0].negate = false; 906 this->result = op[0]; 907 break; 908 case ir_unop_sign: 909 temp = fs_reg(this, ir->type); 910 911 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)); 912 913 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 914 inst->conditional_mod = BRW_CONDITIONAL_G; 915 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)); 916 inst->predicated = true; 917 918 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)); 919 inst->conditional_mod = BRW_CONDITIONAL_L; 920 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)); 921 inst->predicated = true; 922 923 break; 924 case ir_unop_rcp: 925 emit_math(FS_OPCODE_RCP, this->result, op[0]); 926 break; 927 928 case ir_unop_exp2: 929 emit_math(FS_OPCODE_EXP2, this->result, op[0]); 930 break; 931 case ir_unop_log2: 932 emit_math(FS_OPCODE_LOG2, this->result, op[0]); 933 break; 934 case ir_unop_exp: 935 case ir_unop_log: 936 assert(!"not reached: should be handled by ir_explog_to_explog2"); 937 break; 938 case ir_unop_sin: 939 case ir_unop_sin_reduced: 940 emit_math(FS_OPCODE_SIN, this->result, op[0]); 941 break; 942 case ir_unop_cos: 943 case ir_unop_cos_reduced: 944 emit_math(FS_OPCODE_COS, this->result, op[0]); 945 break; 946 947 case ir_unop_dFdx: 948 emit(FS_OPCODE_DDX, this->result, op[0]); 949 break; 950 case ir_unop_dFdy: 951 emit(FS_OPCODE_DDY, this->result, op[0]); 952 break; 953 954 case ir_binop_add: 955 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]); 956 break; 957 case ir_binop_sub: 958 assert(!"not reached: should be handled by ir_sub_to_add_neg"); 959 break; 960 961 case ir_binop_mul: 962 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]); 963 break; 964 case ir_binop_div: 965 assert(!"not reached: should be handled by ir_div_to_mul_rcp"); 966 break; 967 case ir_binop_mod: 968 assert(!"ir_binop_mod should have been converted to b * fract(a/b)"); 969 break; 970 971 case ir_binop_less: 972 case ir_binop_greater: 973 case ir_binop_lequal: 974 case ir_binop_gequal: 975 case ir_binop_equal: 976 case ir_binop_all_equal: 977 case ir_binop_nequal: 978 case ir_binop_any_nequal: 979 temp = this->result; 980 /* original gen4 does implicit conversion before comparison. 
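 * Since that conversion happens before the compare rather than after it, a
 * float comparison written straight to a D destination would be performed on
 * converted values; matching temp's type to op[0] keeps the compare in the
 * operands' own type, and the AND with 1 below still yields the 0/1 boolean.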
*/ 981 if (intel->gen < 5) 982 temp.type = op[0].type; 983 984 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]); 985 inst->conditional_mod = brw_conditional_for_comparison(ir->operation); 986 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)); 987 break; 988 989 case ir_binop_logic_xor: 990 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 991 break; 992 993 case ir_binop_logic_or: 994 emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 995 break; 996 997 case ir_binop_logic_and: 998 emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 999 break; 1000 1001 case ir_binop_dot: 1002 case ir_unop_any: 1003 assert(!"not reached: should be handled by brw_fs_channel_expressions"); 1004 break; 1005 1006 case ir_unop_noise: 1007 assert(!"not reached: should be handled by lower_noise"); 1008 break; 1009 1010 case ir_quadop_vector: 1011 assert(!"not reached: should be handled by lower_quadop_vector"); 1012 break; 1013 1014 case ir_unop_sqrt: 1015 emit_math(FS_OPCODE_SQRT, this->result, op[0]); 1016 break; 1017 1018 case ir_unop_rsq: 1019 emit_math(FS_OPCODE_RSQ, this->result, op[0]); 1020 break; 1021 1022 case ir_unop_i2f: 1023 case ir_unop_b2f: 1024 case ir_unop_b2i: 1025 case ir_unop_f2i: 1026 emit(BRW_OPCODE_MOV, this->result, op[0]); 1027 break; 1028 case ir_unop_f2b: 1029 case ir_unop_i2b: 1030 temp = this->result; 1031 /* original gen4 does implicit conversion before comparison. */ 1032 if (intel->gen < 5) 1033 temp.type = op[0].type; 1034 1035 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)); 1036 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1037 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1)); 1038 break; 1039 1040 case ir_unop_trunc: 1041 emit(BRW_OPCODE_RNDZ, this->result, op[0]); 1042 break; 1043 case ir_unop_ceil: 1044 op[0].negate = !op[0].negate; 1045 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1046 this->result.negate = true; 1047 break; 1048 case ir_unop_floor: 1049 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]); 1050 break; 1051 case ir_unop_fract: 1052 inst = emit(BRW_OPCODE_FRC, this->result, op[0]); 1053 break; 1054 case ir_unop_round_even: 1055 emit(BRW_OPCODE_RNDE, this->result, op[0]); 1056 break; 1057 1058 case ir_binop_min: 1059 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1060 inst->conditional_mod = BRW_CONDITIONAL_L; 1061 1062 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1063 inst->predicated = true; 1064 break; 1065 case ir_binop_max: 1066 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]); 1067 inst->conditional_mod = BRW_CONDITIONAL_G; 1068 1069 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]); 1070 inst->predicated = true; 1071 break; 1072 1073 case ir_binop_pow: 1074 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]); 1075 break; 1076 1077 case ir_unop_bit_not: 1078 inst = emit(BRW_OPCODE_NOT, this->result, op[0]); 1079 break; 1080 case ir_binop_bit_and: 1081 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]); 1082 break; 1083 case ir_binop_bit_xor: 1084 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]); 1085 break; 1086 case ir_binop_bit_or: 1087 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]); 1088 break; 1089 1090 case ir_unop_u2f: 1091 case ir_binop_lshift: 1092 case ir_binop_rshift: 1093 assert(!"GLSL 1.30 features unsupported"); 1094 break; 1095 } 1096} 1097 1098void 1099fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, 1100 const glsl_type *type, bool predicated) 1101{ 1102 switch (type->base_type) { 1103 case GLSL_TYPE_FLOAT: 1104 case GLSL_TYPE_UINT: 1105 case 
GLSL_TYPE_INT: 1106 case GLSL_TYPE_BOOL: 1107 for (unsigned int i = 0; i < type->components(); i++) { 1108 l.type = brw_type_for_base_type(type); 1109 r.type = brw_type_for_base_type(type); 1110 1111 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r); 1112 inst->predicated = predicated; 1113 1114 l.reg_offset++; 1115 r.reg_offset++; 1116 } 1117 break; 1118 case GLSL_TYPE_ARRAY: 1119 for (unsigned int i = 0; i < type->length; i++) { 1120 emit_assignment_writes(l, r, type->fields.array, predicated); 1121 } 1122 break; 1123 1124 case GLSL_TYPE_STRUCT: 1125 for (unsigned int i = 0; i < type->length; i++) { 1126 emit_assignment_writes(l, r, type->fields.structure[i].type, 1127 predicated); 1128 } 1129 break; 1130 1131 case GLSL_TYPE_SAMPLER: 1132 break; 1133 1134 default: 1135 assert(!"not reached"); 1136 break; 1137 } 1138} 1139 1140void 1141fs_visitor::visit(ir_assignment *ir) 1142{ 1143 struct fs_reg l, r; 1144 fs_inst *inst; 1145 1146 /* FINISHME: arrays on the lhs */ 1147 ir->lhs->accept(this); 1148 l = this->result; 1149 1150 ir->rhs->accept(this); 1151 r = this->result; 1152 1153 assert(l.file != BAD_FILE); 1154 assert(r.file != BAD_FILE); 1155 1156 if (ir->condition) { 1157 emit_bool_to_cond_code(ir->condition); 1158 } 1159 1160 if (ir->lhs->type->is_scalar() || 1161 ir->lhs->type->is_vector()) { 1162 for (int i = 0; i < ir->lhs->type->vector_elements; i++) { 1163 if (ir->write_mask & (1 << i)) { 1164 inst = emit(BRW_OPCODE_MOV, l, r); 1165 if (ir->condition) 1166 inst->predicated = true; 1167 r.reg_offset++; 1168 } 1169 l.reg_offset++; 1170 } 1171 } else { 1172 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); 1173 } 1174} 1175 1176fs_inst * 1177fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1178{ 1179 int mlen; 1180 int base_mrf = 1; 1181 bool simd16 = false; 1182 fs_reg orig_dst; 1183 1184 /* g0 header. */ 1185 mlen = 1; 1186 1187 if (ir->shadow_comparitor) { 1188 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1189 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1190 coordinate.reg_offset++; 1191 } 1192 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1193 mlen += 3; 1194 1195 if (ir->op == ir_tex) { 1196 /* There's no plain shadow compare message, so we use shadow 1197 * compare with a bias of 0.0. 1198 */ 1199 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); 1200 mlen++; 1201 } else if (ir->op == ir_txb) { 1202 ir->lod_info.bias->accept(this); 1203 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1204 mlen++; 1205 } else { 1206 assert(ir->op == ir_txl); 1207 ir->lod_info.lod->accept(this); 1208 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1209 mlen++; 1210 } 1211 1212 ir->shadow_comparitor->accept(this); 1213 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1214 mlen++; 1215 } else if (ir->op == ir_tex) { 1216 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1217 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate); 1218 coordinate.reg_offset++; 1219 } 1220 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ 1221 mlen += 3; 1222 } else if (ir->op == ir_txd) { 1223 assert(!"TXD isn't supported on gen4 yet."); 1224 } else { 1225 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod 1226 * instructions. We'll need to do SIMD16 here. 
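 * The workaround below lays the message out as SIMD16: each coordinate
 * component is written with a stride of two message registers (i * 2), the
 * bias/lod goes after the three u/v/r pairs (hence mlen += 6), and the sampler
 * returns two interleaved vec4s.  dst is re-pointed at a temporary vec4[2] and
 * the MOVs after the SEND copy every other register back into the real
 * destination, discarding the junk halves.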
1227 */ 1228 assert(ir->op == ir_txb || ir->op == ir_txl); 1229 1230 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1231 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate); 1232 coordinate.reg_offset++; 1233 } 1234 1235 /* lod/bias appears after u/v/r. */ 1236 mlen += 6; 1237 1238 if (ir->op == ir_txb) { 1239 ir->lod_info.bias->accept(this); 1240 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1241 mlen++; 1242 } else { 1243 ir->lod_info.lod->accept(this); 1244 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1245 mlen++; 1246 } 1247 1248 /* The unused upper half. */ 1249 mlen++; 1250 1251 /* Now, since we're doing simd16, the return is 2 interleaved 1252 * vec4s where the odd-indexed ones are junk. We'll need to move 1253 * this weirdness around to the expected layout. 1254 */ 1255 simd16 = true; 1256 orig_dst = dst; 1257 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 1258 2)); 1259 dst.type = BRW_REGISTER_TYPE_F; 1260 } 1261 1262 fs_inst *inst = NULL; 1263 switch (ir->op) { 1264 case ir_tex: 1265 inst = emit(FS_OPCODE_TEX, dst); 1266 break; 1267 case ir_txb: 1268 inst = emit(FS_OPCODE_TXB, dst); 1269 break; 1270 case ir_txl: 1271 inst = emit(FS_OPCODE_TXL, dst); 1272 break; 1273 case ir_txd: 1274 inst = emit(FS_OPCODE_TXD, dst); 1275 break; 1276 case ir_txf: 1277 assert(!"GLSL 1.30 features unsupported"); 1278 break; 1279 } 1280 inst->base_mrf = base_mrf; 1281 inst->mlen = mlen; 1282 1283 if (simd16) { 1284 for (int i = 0; i < 4; i++) { 1285 emit(BRW_OPCODE_MOV, orig_dst, dst); 1286 orig_dst.reg_offset++; 1287 dst.reg_offset += 2; 1288 } 1289 } 1290 1291 return inst; 1292} 1293 1294/* gen5's sampler has slots for u, v, r, array index, then optional 1295 * parameters like shadow comparitor or LOD bias. If optional 1296 * parameters aren't present, those base slots are optional and don't 1297 * need to be included in the message. 1298 * 1299 * We don't fill in the unnecessary slots regardless, which may look 1300 * surprising in the disassembly. 1301 */ 1302fs_inst * 1303fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1304{ 1305 int mlen = 1; /* g0 header always present. 
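 * mlen starts at 1 to account for it; the coordinate components then go in the
 * registers that follow, one register per component (or two in SIMD16).  The
 * optional shadow comparitor and bias/lod parameters have fixed positions
 * after the four u/v/r/ai slots, which is why the MAX2(mlen, 1 + 4 * reg_width)
 * calls below bump mlen up to that position when the coordinate has fewer
 * than four components.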
*/ 1306 int base_mrf = 1; 1307 int reg_width = c->dispatch_width / 8; 1308 1309 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1310 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * reg_width), 1311 coordinate); 1312 coordinate.reg_offset++; 1313 } 1314 mlen += ir->coordinate->type->vector_elements * reg_width; 1315 1316 if (ir->shadow_comparitor) { 1317 mlen = MAX2(mlen, 1 + 4 * reg_width); 1318 1319 ir->shadow_comparitor->accept(this); 1320 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1321 mlen += reg_width; 1322 } 1323 1324 fs_inst *inst = NULL; 1325 switch (ir->op) { 1326 case ir_tex: 1327 inst = emit(FS_OPCODE_TEX, dst); 1328 break; 1329 case ir_txb: 1330 ir->lod_info.bias->accept(this); 1331 mlen = MAX2(mlen, 1 + 4 * reg_width); 1332 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1333 mlen += reg_width; 1334 1335 inst = emit(FS_OPCODE_TXB, dst); 1336 1337 break; 1338 case ir_txl: 1339 ir->lod_info.lod->accept(this); 1340 mlen = MAX2(mlen, 1 + 4 * reg_width); 1341 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1342 mlen += reg_width; 1343 1344 inst = emit(FS_OPCODE_TXL, dst); 1345 break; 1346 case ir_txd: 1347 case ir_txf: 1348 assert(!"GLSL 1.30 features unsupported"); 1349 break; 1350 } 1351 inst->base_mrf = base_mrf; 1352 inst->mlen = mlen; 1353 1354 if (mlen > 11) { 1355 fail("Message length >11 disallowed by hardware\n"); 1356 } 1357 1358 return inst; 1359} 1360 1361fs_inst * 1362fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate) 1363{ 1364 int mlen = 1; /* g0 header always present. */ 1365 int base_mrf = 1; 1366 int reg_width = c->dispatch_width / 8; 1367 1368 if (ir->shadow_comparitor) { 1369 ir->shadow_comparitor->accept(this); 1370 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1371 mlen += reg_width; 1372 } 1373 1374 /* Set up the LOD info */ 1375 switch (ir->op) { 1376 case ir_tex: 1377 break; 1378 case ir_txb: 1379 ir->lod_info.bias->accept(this); 1380 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1381 mlen += reg_width; 1382 break; 1383 case ir_txl: 1384 ir->lod_info.lod->accept(this); 1385 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result); 1386 mlen += reg_width; 1387 break; 1388 case ir_txd: 1389 case ir_txf: 1390 assert(!"GLSL 1.30 features unsupported"); 1391 break; 1392 } 1393 1394 /* Set up the coordinate */ 1395 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { 1396 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), 1397 coordinate); 1398 coordinate.reg_offset++; 1399 mlen += reg_width; 1400 } 1401 1402 /* Generate the SEND */ 1403 fs_inst *inst = NULL; 1404 switch (ir->op) { 1405 case ir_tex: inst = emit(FS_OPCODE_TEX, dst); break; 1406 case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break; 1407 case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break; 1408 case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break; 1409 case ir_txf: assert(!"TXF unsupported."); 1410 } 1411 inst->base_mrf = base_mrf; 1412 inst->mlen = mlen; 1413 1414 if (mlen > 11) { 1415 fail("Message length >11 disallowed by hardware\n"); 1416 } 1417 1418 return inst; 1419} 1420 1421void 1422fs_visitor::visit(ir_texture *ir) 1423{ 1424 int sampler; 1425 fs_inst *inst = NULL; 1426 1427 ir->coordinate->accept(this); 1428 fs_reg coordinate = this->result; 1429 1430 if (ir->offset != NULL) { 1431 ir_constant *offset = ir->offset->as_constant(); 1432 assert(offset != NULL); 1433 1434 signed char offsets[3]; 1435 for (unsigned i = 0; i < 
ir->offset->type->vector_elements; i++) 1436 offsets[i] = (signed char) offset->value.i[i]; 1437 1438 /* Combine all three offsets into a single unsigned dword: 1439 * 1440 * bits 11:8 - U Offset (X component) 1441 * bits 7:4 - V Offset (Y component) 1442 * bits 3:0 - R Offset (Z component) 1443 */ 1444 unsigned offset_bits = 0; 1445 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { 1446 const unsigned shift = 4 * (2 - i); 1447 offset_bits |= (offsets[i] << shift) & (0xF << shift); 1448 } 1449 1450 /* Explicitly set up the message header by copying g0 to msg reg m1. */ 1451 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), 1452 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD)); 1453 1454 /* Then set the offset bits in DWord 2 of the message header. */ 1455 emit(BRW_OPCODE_MOV, 1456 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), 1457 BRW_REGISTER_TYPE_UD)), 1458 fs_reg(brw_imm_uw(offset_bits))); 1459 } 1460 1461 /* Should be lowered by do_lower_texture_projection */ 1462 assert(!ir->projector); 1463 1464 sampler = _mesa_get_sampler_uniform_value(ir->sampler, 1465 ctx->Shader.CurrentFragmentProgram, 1466 &brw->fragment_program->Base); 1467 sampler = c->fp->program.Base.SamplerUnits[sampler]; 1468 1469 /* The 965 requires the EU to do the normalization of GL rectangle 1470 * texture coordinates. We use the program parameter state 1471 * tracking to get the scaling factor. 1472 */ 1473 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) { 1474 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters; 1475 int tokens[STATE_LENGTH] = { 1476 STATE_INTERNAL, 1477 STATE_TEXRECT_SCALE, 1478 sampler, 1479 0, 1480 0 1481 }; 1482 1483 if (c->dispatch_width == 16) { 1484 fail("rectangle scale uniform setup not supported on 16-wide\n"); 1485 this->result = fs_reg(this, ir->type); 1486 return; 1487 } 1488 1489 c->prog_data.param_convert[c->prog_data.nr_params] = 1490 PARAM_NO_CONVERT; 1491 c->prog_data.param_convert[c->prog_data.nr_params + 1] = 1492 PARAM_NO_CONVERT; 1493 1494 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params); 1495 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); 1496 GLuint index = _mesa_add_state_reference(params, 1497 (gl_state_index *)tokens); 1498 1499 this->param_index[c->prog_data.nr_params] = index; 1500 this->param_offset[c->prog_data.nr_params] = 0; 1501 c->prog_data.nr_params++; 1502 this->param_index[c->prog_data.nr_params] = index; 1503 this->param_offset[c->prog_data.nr_params] = 1; 1504 c->prog_data.nr_params++; 1505 1506 fs_reg dst = fs_reg(this, ir->coordinate->type); 1507 fs_reg src = coordinate; 1508 coordinate = dst; 1509 1510 emit(BRW_OPCODE_MUL, dst, src, scale_x); 1511 dst.reg_offset++; 1512 src.reg_offset++; 1513 emit(BRW_OPCODE_MUL, dst, src, scale_y); 1514 } 1515 1516 /* Writemasking doesn't eliminate channels on SIMD8 texture 1517 * samples, so don't worry about them. 1518 */ 1519 fs_reg dst = fs_reg(this, glsl_type::vec4_type); 1520 1521 if (intel->gen >= 7) { 1522 inst = emit_texture_gen7(ir, dst, coordinate); 1523 } else if (intel->gen >= 5) { 1524 inst = emit_texture_gen5(ir, dst, coordinate); 1525 } else { 1526 inst = emit_texture_gen4(ir, dst, coordinate); 1527 } 1528 1529 /* If there's an offset, we already set up m1. To avoid the implied move, 1530 * use the null register. Otherwise, we want an implied move from g0. 
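 * The offset path built m1 by hand earlier in this function: g0 is copied into
 * m1 and the packed offsets go into DWord 2, e.g. a constant ivec3(1, -2, 3)
 * offset packs to (1 << 8) | ((-2 & 0xf) << 4) | 3 == 0x1e3, each component
 * truncated to a signed 4-bit field.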
1531 */ 1532 if (ir->offset != NULL) 1533 inst->src[0] = fs_reg(brw_null_reg()); 1534 else 1535 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); 1536 1537 inst->sampler = sampler; 1538 1539 this->result = dst; 1540 1541 if (ir->shadow_comparitor) 1542 inst->shadow_compare = true; 1543 1544 if (ir->type == glsl_type::float_type) { 1545 /* Ignore DEPTH_TEXTURE_MODE swizzling. */ 1546 assert(ir->sampler->type->sampler_shadow); 1547 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { 1548 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); 1549 1550 for (int i = 0; i < 4; i++) { 1551 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1552 fs_reg l = swizzle_dst; 1553 l.reg_offset += i; 1554 1555 if (swiz == SWIZZLE_ZERO) { 1556 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f)); 1557 } else if (swiz == SWIZZLE_ONE) { 1558 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f)); 1559 } else { 1560 fs_reg r = dst; 1561 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i); 1562 emit(BRW_OPCODE_MOV, l, r); 1563 } 1564 } 1565 this->result = swizzle_dst; 1566 } 1567} 1568 1569void 1570fs_visitor::visit(ir_swizzle *ir) 1571{ 1572 ir->val->accept(this); 1573 fs_reg val = this->result; 1574 1575 if (ir->type->vector_elements == 1) { 1576 this->result.reg_offset += ir->mask.x; 1577 return; 1578 } 1579 1580 fs_reg result = fs_reg(this, ir->type); 1581 this->result = result; 1582 1583 for (unsigned int i = 0; i < ir->type->vector_elements; i++) { 1584 fs_reg channel = val; 1585 int swiz = 0; 1586 1587 switch (i) { 1588 case 0: 1589 swiz = ir->mask.x; 1590 break; 1591 case 1: 1592 swiz = ir->mask.y; 1593 break; 1594 case 2: 1595 swiz = ir->mask.z; 1596 break; 1597 case 3: 1598 swiz = ir->mask.w; 1599 break; 1600 } 1601 1602 channel.reg_offset += swiz; 1603 emit(BRW_OPCODE_MOV, result, channel); 1604 result.reg_offset++; 1605 } 1606} 1607 1608void 1609fs_visitor::visit(ir_discard *ir) 1610{ 1611 fs_reg temp = fs_reg(this, glsl_type::uint_type); 1612 1613 assert(ir->condition == NULL); /* FINISHME */ 1614 1615 emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d); 1616 emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp); 1617 kill_emitted = true; 1618} 1619 1620void 1621fs_visitor::visit(ir_constant *ir) 1622{ 1623 /* Set this->result to reg at the bottom of the function because some code 1624 * paths will cause this visitor to be applied to other fields. This will 1625 * cause the value stored in this->result to be modified. 1626 * 1627 * Make reg constant so that it doesn't get accidentally modified along the 1628 * way. Yes, I actually had this problem. 
:( 1629 */ 1630 const fs_reg reg(this, ir->type); 1631 fs_reg dst_reg = reg; 1632 1633 if (ir->type->is_array()) { 1634 const unsigned size = type_size(ir->type->fields.array); 1635 1636 for (unsigned i = 0; i < ir->type->length; i++) { 1637 ir->array_elements[i]->accept(this); 1638 fs_reg src_reg = this->result; 1639 1640 dst_reg.type = src_reg.type; 1641 for (unsigned j = 0; j < size; j++) { 1642 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1643 src_reg.reg_offset++; 1644 dst_reg.reg_offset++; 1645 } 1646 } 1647 } else if (ir->type->is_record()) { 1648 foreach_list(node, &ir->components) { 1649 ir_instruction *const field = (ir_instruction *) node; 1650 const unsigned size = type_size(field->type); 1651 1652 field->accept(this); 1653 fs_reg src_reg = this->result; 1654 1655 dst_reg.type = src_reg.type; 1656 for (unsigned j = 0; j < size; j++) { 1657 emit(BRW_OPCODE_MOV, dst_reg, src_reg); 1658 src_reg.reg_offset++; 1659 dst_reg.reg_offset++; 1660 } 1661 } 1662 } else { 1663 const unsigned size = type_size(ir->type); 1664 1665 for (unsigned i = 0; i < size; i++) { 1666 switch (ir->type->base_type) { 1667 case GLSL_TYPE_FLOAT: 1668 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])); 1669 break; 1670 case GLSL_TYPE_UINT: 1671 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])); 1672 break; 1673 case GLSL_TYPE_INT: 1674 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])); 1675 break; 1676 case GLSL_TYPE_BOOL: 1677 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])); 1678 break; 1679 default: 1680 assert(!"Non-float/uint/int/bool constant"); 1681 } 1682 dst_reg.reg_offset++; 1683 } 1684 } 1685 1686 this->result = reg; 1687} 1688 1689void 1690fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) 1691{ 1692 ir_expression *expr = ir->as_expression(); 1693 1694 if (expr) { 1695 fs_reg op[2]; 1696 fs_inst *inst; 1697 1698 assert(expr->get_num_operands() <= 2); 1699 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1700 assert(expr->operands[i]->type->is_scalar()); 1701 1702 expr->operands[i]->accept(this); 1703 op[i] = this->result; 1704 } 1705 1706 switch (expr->operation) { 1707 case ir_unop_logic_not: 1708 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)); 1709 inst->conditional_mod = BRW_CONDITIONAL_Z; 1710 break; 1711 1712 case ir_binop_logic_xor: 1713 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]); 1714 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1715 break; 1716 1717 case ir_binop_logic_or: 1718 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]); 1719 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1720 break; 1721 1722 case ir_binop_logic_and: 1723 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]); 1724 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1725 break; 1726 1727 case ir_unop_f2b: 1728 if (intel->gen >= 6) { 1729 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f)); 1730 } else { 1731 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]); 1732 } 1733 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1734 break; 1735 1736 case ir_unop_i2b: 1737 if (intel->gen >= 6) { 1738 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)); 1739 } else { 1740 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]); 1741 } 1742 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1743 break; 1744 1745 case ir_binop_greater: 1746 case ir_binop_gequal: 1747 case ir_binop_less: 1748 case ir_binop_lequal: 1749 case ir_binop_equal: 1750 case ir_binop_all_equal: 1751 case ir_binop_nequal: 1752 case ir_binop_any_nequal: 1753 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]); 1754 
inst->conditional_mod = 1755 brw_conditional_for_comparison(expr->operation); 1756 break; 1757 1758 default: 1759 assert(!"not reached"); 1760 fail("bad cond code\n"); 1761 break; 1762 } 1763 return; 1764 } 1765 1766 ir->accept(this); 1767 1768 if (intel->gen >= 6) { 1769 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1)); 1770 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1771 } else { 1772 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result); 1773 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1774 } 1775} 1776 1777/** 1778 * Emit a gen6 IF statement with the comparison folded into the IF 1779 * instruction. 1780 */ 1781void 1782fs_visitor::emit_if_gen6(ir_if *ir) 1783{ 1784 ir_expression *expr = ir->condition->as_expression(); 1785 1786 if (expr) { 1787 fs_reg op[2]; 1788 fs_inst *inst; 1789 fs_reg temp; 1790 1791 assert(expr->get_num_operands() <= 2); 1792 for (unsigned int i = 0; i < expr->get_num_operands(); i++) { 1793 assert(expr->operands[i]->type->is_scalar()); 1794 1795 expr->operands[i]->accept(this); 1796 op[i] = this->result; 1797 } 1798 1799 switch (expr->operation) { 1800 case ir_unop_logic_not: 1801 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0)); 1802 inst->conditional_mod = BRW_CONDITIONAL_Z; 1803 return; 1804 1805 case ir_binop_logic_xor: 1806 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1807 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1808 return; 1809 1810 case ir_binop_logic_or: 1811 temp = fs_reg(this, glsl_type::bool_type); 1812 emit(BRW_OPCODE_OR, temp, op[0], op[1]); 1813 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1814 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1815 return; 1816 1817 case ir_binop_logic_and: 1818 temp = fs_reg(this, glsl_type::bool_type); 1819 emit(BRW_OPCODE_AND, temp, op[0], op[1]); 1820 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)); 1821 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1822 return; 1823 1824 case ir_unop_f2b: 1825 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)); 1826 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1827 return; 1828 1829 case ir_unop_i2b: 1830 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1831 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1832 return; 1833 1834 case ir_binop_greater: 1835 case ir_binop_gequal: 1836 case ir_binop_less: 1837 case ir_binop_lequal: 1838 case ir_binop_equal: 1839 case ir_binop_all_equal: 1840 case ir_binop_nequal: 1841 case ir_binop_any_nequal: 1842 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]); 1843 inst->conditional_mod = 1844 brw_conditional_for_comparison(expr->operation); 1845 return; 1846 default: 1847 assert(!"not reached"); 1848 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)); 1849 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1850 fail("bad condition\n"); 1851 return; 1852 } 1853 return; 1854 } 1855 1856 ir->condition->accept(this); 1857 1858 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)); 1859 inst->conditional_mod = BRW_CONDITIONAL_NZ; 1860} 1861 1862void 1863fs_visitor::visit(ir_if *ir) 1864{ 1865 fs_inst *inst; 1866 1867 if (c->dispatch_width == 16) { 1868 fail("Can't support (non-uniform) control flow on 16-wide\n"); 1869 } 1870 1871 /* Don't point the annotation at the if statement, because then it plus 1872 * the then and else blocks get printed. 
1873 */ 1874 this->base_ir = ir->condition; 1875 1876 if (intel->gen >= 6) { 1877 emit_if_gen6(ir); 1878 } else { 1879 emit_bool_to_cond_code(ir->condition); 1880 1881 inst = emit(BRW_OPCODE_IF); 1882 inst->predicated = true; 1883 } 1884 1885 foreach_iter(exec_list_iterator, iter, ir->then_instructions) { 1886 ir_instruction *ir = (ir_instruction *)iter.get(); 1887 this->base_ir = ir; 1888 1889 ir->accept(this); 1890 } 1891 1892 if (!ir->else_instructions.is_empty()) { 1893 emit(BRW_OPCODE_ELSE); 1894 1895 foreach_iter(exec_list_iterator, iter, ir->else_instructions) { 1896 ir_instruction *ir = (ir_instruction *)iter.get(); 1897 this->base_ir = ir; 1898 1899 ir->accept(this); 1900 } 1901 } 1902 1903 emit(BRW_OPCODE_ENDIF); 1904} 1905 1906void 1907fs_visitor::visit(ir_loop *ir) 1908{ 1909 fs_reg counter = reg_undef; 1910 1911 if (c->dispatch_width == 16) { 1912 fail("Can't support (non-uniform) control flow on 16-wide\n"); 1913 } 1914 1915 if (ir->counter) { 1916 this->base_ir = ir->counter; 1917 ir->counter->accept(this); 1918 counter = *(variable_storage(ir->counter)); 1919 1920 if (ir->from) { 1921 this->base_ir = ir->from; 1922 ir->from->accept(this); 1923 1924 emit(BRW_OPCODE_MOV, counter, this->result); 1925 } 1926 } 1927 1928 emit(BRW_OPCODE_DO); 1929 1930 if (ir->to) { 1931 this->base_ir = ir->to; 1932 ir->to->accept(this); 1933 1934 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result); 1935 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); 1936 1937 inst = emit(BRW_OPCODE_BREAK); 1938 inst->predicated = true; 1939 } 1940 1941 foreach_iter(exec_list_iterator, iter, ir->body_instructions) { 1942 ir_instruction *ir = (ir_instruction *)iter.get(); 1943 1944 this->base_ir = ir; 1945 ir->accept(this); 1946 } 1947 1948 if (ir->increment) { 1949 this->base_ir = ir->increment; 1950 ir->increment->accept(this); 1951 emit(BRW_OPCODE_ADD, counter, counter, this->result); 1952 } 1953 1954 emit(BRW_OPCODE_WHILE); 1955} 1956 1957void 1958fs_visitor::visit(ir_loop_jump *ir) 1959{ 1960 switch (ir->mode) { 1961 case ir_loop_jump::jump_break: 1962 emit(BRW_OPCODE_BREAK); 1963 break; 1964 case ir_loop_jump::jump_continue: 1965 emit(BRW_OPCODE_CONTINUE); 1966 break; 1967 } 1968} 1969 1970void 1971fs_visitor::visit(ir_call *ir) 1972{ 1973 assert(!"FINISHME"); 1974} 1975 1976void 1977fs_visitor::visit(ir_return *ir) 1978{ 1979 assert(!"FINISHME"); 1980} 1981 1982void 1983fs_visitor::visit(ir_function *ir) 1984{ 1985 /* Ignore function bodies other than main() -- we shouldn't see calls to 1986 * them since they should all be inlined before we get to ir_to_mesa. 
1987 */ 1988 if (strcmp(ir->name, "main") == 0) { 1989 const ir_function_signature *sig; 1990 exec_list empty; 1991 1992 sig = ir->matching_signature(&empty); 1993 1994 assert(sig); 1995 1996 foreach_iter(exec_list_iterator, iter, sig->body) { 1997 ir_instruction *ir = (ir_instruction *)iter.get(); 1998 this->base_ir = ir; 1999 2000 ir->accept(this); 2001 } 2002 } 2003} 2004 2005void 2006fs_visitor::visit(ir_function_signature *ir) 2007{ 2008 assert(!"not reached"); 2009 (void)ir; 2010} 2011 2012fs_inst * 2013fs_visitor::emit(fs_inst inst) 2014{ 2015 fs_inst *list_inst = new(mem_ctx) fs_inst; 2016 *list_inst = inst; 2017 2018 if (force_uncompressed_stack > 0) 2019 list_inst->force_uncompressed = true; 2020 else if (force_sechalf_stack > 0) 2021 list_inst->force_sechalf = true; 2022 2023 list_inst->annotation = this->current_annotation; 2024 list_inst->ir = this->base_ir; 2025 2026 this->instructions.push_tail(list_inst); 2027 2028 return list_inst; 2029} 2030 2031/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 2032void 2033fs_visitor::emit_dummy_fs() 2034{ 2035 /* Everyone's favorite color. */ 2036 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f)); 2037 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f)); 2038 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f)); 2039 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f)); 2040 2041 fs_inst *write; 2042 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); 2043 write->base_mrf = 0; 2044} 2045 2046/* The register location here is relative to the start of the URB 2047 * data. It will get adjusted to be a real location before 2048 * generate_code() time. 2049 */ 2050struct brw_reg 2051fs_visitor::interp_reg(int location, int channel) 2052{ 2053 int regnr = urb_setup[location] * 2 + channel / 2; 2054 int stride = (channel & 1) * 4; 2055 2056 assert(urb_setup[location] != -1); 2057 2058 return brw_vec1_grf(regnr, stride); 2059} 2060 2061/** Emits the interpolation for the varying inputs. */ 2062void 2063fs_visitor::emit_interpolation_setup_gen4() 2064{ 2065 this->current_annotation = "compute pixel centers"; 2066 this->pixel_x = fs_reg(this, glsl_type::uint_type); 2067 this->pixel_y = fs_reg(this, glsl_type::uint_type); 2068 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 2069 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 2070 2071 emit(FS_OPCODE_PIXEL_X, this->pixel_x); 2072 emit(FS_OPCODE_PIXEL_Y, this->pixel_y); 2073 2074 this->current_annotation = "compute pixel deltas from v0"; 2075 if (brw->has_pln) { 2076 this->delta_x = fs_reg(this, glsl_type::vec2_type); 2077 this->delta_y = this->delta_x; 2078 this->delta_y.reg_offset++; 2079 } else { 2080 this->delta_x = fs_reg(this, glsl_type::float_type); 2081 this->delta_y = fs_reg(this, glsl_type::float_type); 2082 } 2083 emit(BRW_OPCODE_ADD, this->delta_x, 2084 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))); 2085 emit(BRW_OPCODE_ADD, this->delta_y, 2086 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))); 2087 2088 this->current_annotation = "compute pos.w and 1/pos.w"; 2089 /* Compute wpos.w. It's always in our setup, since it's needed to 2090 * interpolate the other attributes. 2091 */ 2092 this->wpos_w = fs_reg(this, glsl_type::float_type); 2093 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y, 2094 interp_reg(FRAG_ATTRIB_WPOS, 3)); 2095 /* Compute the pixel 1/W value from wpos.w. 
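 * pixel_w is what emit_general_interpolation() multiplies attributes by on
 * pre-gen6 hardware to make them perspective-correct, while wpos_w is what
 * ends up in gl_FragCoord.w via emit_fragcoord_interpolation().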
*/ 2096 this->pixel_w = fs_reg(this, glsl_type::float_type); 2097 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w); 2098 this->current_annotation = NULL; 2099} 2100 2101/** Emits the interpolation for the varying inputs. */ 2102void 2103fs_visitor::emit_interpolation_setup_gen6() 2104{ 2105 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 2106 2107 /* If the pixel centers end up used, the setup is the same as for gen4. */ 2108 this->current_annotation = "compute pixel centers"; 2109 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); 2110 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); 2111 int_pixel_x.type = BRW_REGISTER_TYPE_UW; 2112 int_pixel_y.type = BRW_REGISTER_TYPE_UW; 2113 emit(BRW_OPCODE_ADD, 2114 int_pixel_x, 2115 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 2116 fs_reg(brw_imm_v(0x10101010))); 2117 emit(BRW_OPCODE_ADD, 2118 int_pixel_y, 2119 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 2120 fs_reg(brw_imm_v(0x11001100))); 2121 2122 /* As of gen6, we can no longer mix float and int sources. We have 2123 * to turn the integer pixel centers into floats for their actual 2124 * use. 2125 */ 2126 this->pixel_x = fs_reg(this, glsl_type::float_type); 2127 this->pixel_y = fs_reg(this, glsl_type::float_type); 2128 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x); 2129 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y); 2130 2131 this->current_annotation = "compute pos.w"; 2132 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0)); 2133 this->wpos_w = fs_reg(this, glsl_type::float_type); 2134 emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w); 2135 2136 this->delta_x = fs_reg(brw_vec8_grf(2, 0)); 2137 this->delta_y = fs_reg(brw_vec8_grf(3, 0)); 2138 2139 this->current_annotation = NULL; 2140} 2141 2142void 2143fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color) 2144{ 2145 int reg_width = c->dispatch_width / 8; 2146 2147 if (c->dispatch_width == 8 || intel->gen == 6) { 2148 /* SIMD8 write looks like: 2149 * m + 0: r0 2150 * m + 1: r1 2151 * m + 2: g0 2152 * m + 3: g1 2153 * 2154 * gen6 SIMD16 DP write looks like: 2155 * m + 0: r0 2156 * m + 1: r1 2157 * m + 2: g0 2158 * m + 3: g1 2159 * m + 4: b0 2160 * m + 5: b1 2161 * m + 6: a0 2162 * m + 7: a1 2163 */ 2164 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width), 2165 color); 2166 } else { 2167 /* pre-gen6 SIMD16 single source DP write looks like: 2168 * m + 0: r0 2169 * m + 1: g0 2170 * m + 2: b0 2171 * m + 3: a0 2172 * m + 4: r1 2173 * m + 5: g1 2174 * m + 6: b1 2175 * m + 7: a1 2176 */ 2177 if (brw->has_compr4) { 2178 /* By setting the high bit of the MRF register number, we 2179 * indicate that we want COMPR4 mode - instead of doing the 2180 * usual destination + 1 for the second half we get 2181 * destination + 4. 
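 * (For example, a COMPR4 write aimed at m2 lands its first half in m2
 * and its second half in m6 -- the same index / index + 4 pair that the
 * non-COMPR4 path below writes with two explicit MOVs.)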
2182 */ 2183 emit(BRW_OPCODE_MOV, 2184 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color); 2185 } else { 2186 push_force_uncompressed(); 2187 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color); 2188 pop_force_uncompressed(); 2189 2190 push_force_sechalf(); 2191 color.sechalf = true; 2192 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color); 2193 pop_force_sechalf(); 2194 color.sechalf = false; 2195 } 2196 } 2197} 2198 2199void 2200fs_visitor::emit_fb_writes() 2201{ 2202 this->current_annotation = "FB write header"; 2203 GLboolean header_present = GL_TRUE; 2204 int nr = 0; 2205 int reg_width = c->dispatch_width / 8; 2206 2207 if (intel->gen >= 6 && 2208 !this->kill_emitted && 2209 c->key.nr_color_regions == 1) { 2210 header_present = false; 2211 } 2212 2213 if (header_present) { 2214 /* m0, m1 header */ 2215 nr += 2; 2216 } 2217 2218 if (c->aa_dest_stencil_reg) { 2219 push_force_uncompressed(); 2220 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), 2221 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))); 2222 pop_force_uncompressed(); 2223 } 2224 2225 /* Reserve space for color. It'll be filled in per MRT below. */ 2226 int color_mrf = nr; 2227 nr += 4 * reg_width; 2228 2229 if (c->source_depth_to_render_target) { 2230 if (intel->gen == 6 && c->dispatch_width == 16) { 2231 /* For outputting oDepth on gen6, SIMD8 writes have to be 2232 * used. This would require 8-wide moves of each half to 2233 * message regs, kind of like pre-gen5 SIMD16 FB writes. 2234 * Just bail on doing so for now. 2235 */ 2236 fail("Missing support for simd16 depth writes on gen6\n"); 2237 } 2238 2239 if (c->computes_depth) { 2240 /* Hand over gl_FragDepth. */ 2241 assert(this->frag_depth); 2242 fs_reg depth = *(variable_storage(this->frag_depth)); 2243 2244 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth); 2245 } else { 2246 /* Pass through the payload depth. */ 2247 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), 2248 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))); 2249 } 2250 nr += reg_width; 2251 } 2252 2253 if (c->dest_depth_reg) { 2254 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), 2255 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))); 2256 nr += reg_width; 2257 } 2258 2259 fs_reg color = reg_undef; 2260 if (this->frag_color) 2261 color = *(variable_storage(this->frag_color)); 2262 else if (this->frag_data) { 2263 color = *(variable_storage(this->frag_data)); 2264 color.type = BRW_REGISTER_TYPE_F; 2265 } 2266 2267 for (int target = 0; target < c->key.nr_color_regions; target++) { 2268 this->current_annotation = ralloc_asprintf(this->mem_ctx, 2269 "FB write target %d", 2270 target); 2271 if (this->frag_color || this->frag_data) { 2272 for (int i = 0; i < 4; i++) { 2273 emit_color_write(i, color_mrf, color); 2274 color.reg_offset++; 2275 } 2276 } 2277 2278 if (this->frag_color) 2279 color.reg_offset -= 4; 2280 2281 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2282 inst->target = target; 2283 inst->base_mrf = 0; 2284 inst->mlen = nr; 2285 if (target == c->key.nr_color_regions - 1) 2286 inst->eot = true; 2287 inst->header_present = header_present; 2288 } 2289 2290 if (c->key.nr_color_regions == 0) { 2291 if (c->key.alpha_test && (this->frag_color || this->frag_data)) { 2292 /* If the alpha test is enabled but there's no color buffer, 2293 * we still need to send alpha out the pipeline to our null 2294 * renderbuffer. 
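 * (Only the alpha value matters for the test, so reg_offset is bumped
 * by 3 below and just component 3 of the color is copied into the
 * message.)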
2295 */ 2296 color.reg_offset += 3; 2297 emit_color_write(3, color_mrf, color); 2298 } 2299 2300 fs_inst *inst = emit(FS_OPCODE_FB_WRITE); 2301 inst->base_mrf = 0; 2302 inst->mlen = nr; 2303 inst->eot = true; 2304 inst->header_present = header_present; 2305 } 2306 2307 this->current_annotation = NULL; 2308} 2309 2310void 2311fs_visitor::generate_fb_write(fs_inst *inst) 2312{ 2313 GLboolean eot = inst->eot; 2314 struct brw_reg implied_header; 2315 2316 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied 2317 * move, here's g1. 2318 */ 2319 brw_push_insn_state(p); 2320 brw_set_mask_control(p, BRW_MASK_DISABLE); 2321 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2322 2323 if (inst->header_present) { 2324 if (intel->gen >= 6) { 2325 brw_MOV(p, 2326 brw_message_reg(inst->base_mrf), 2327 brw_vec8_grf(0, 0)); 2328 2329 if (inst->target > 0) { 2330 /* Set the render target index for choosing BLEND_STATE. */ 2331 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2), 2332 BRW_REGISTER_TYPE_UD), 2333 brw_imm_ud(inst->target)); 2334 } 2335 2336 /* Clear viewport index, render target array index. */ 2337 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0), 2338 BRW_REGISTER_TYPE_UD), 2339 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2340 brw_imm_ud(0xf7ff)); 2341 2342 implied_header = brw_null_reg(); 2343 } else { 2344 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 2345 } 2346 2347 brw_MOV(p, 2348 brw_message_reg(inst->base_mrf + 1), 2349 brw_vec8_grf(1, 0)); 2350 } else { 2351 implied_header = brw_null_reg(); 2352 } 2353 2354 brw_pop_insn_state(p); 2355 2356 brw_fb_WRITE(p, 2357 c->dispatch_width, 2358 inst->base_mrf, 2359 implied_header, 2360 inst->target, 2361 inst->mlen, 2362 0, 2363 eot, 2364 inst->header_present); 2365} 2366 2367/* Computes the integer pixel x,y values from the origin. 2368 * 2369 * This is the basis of gl_FragCoord computation, but is also used 2370 * pre-gen6 for computing the deltas from v0 for computing 2371 * interpolation. 2372 */ 2373void 2374fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x) 2375{ 2376 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 2377 struct brw_reg src; 2378 struct brw_reg deltas; 2379 2380 if (is_x) { 2381 src = stride(suboffset(g1_uw, 4), 2, 4, 0); 2382 deltas = brw_imm_v(0x10101010); 2383 } else { 2384 src = stride(suboffset(g1_uw, 5), 2, 4, 0); 2385 deltas = brw_imm_v(0x11001100); 2386 } 2387 2388 if (c->dispatch_width == 16) { 2389 dst = vec16(dst); 2390 } 2391 2392 /* We do this 8 or 16-wide, but since the destination is UW we 2393 * don't do compression in the 16-wide case. 
2394 */ 2395 brw_push_insn_state(p); 2396 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2397 brw_ADD(p, dst, src, deltas); 2398 brw_pop_insn_state(p); 2399} 2400 2401void 2402fs_visitor::generate_linterp(fs_inst *inst, 2403 struct brw_reg dst, struct brw_reg *src) 2404{ 2405 struct brw_reg delta_x = src[0]; 2406 struct brw_reg delta_y = src[1]; 2407 struct brw_reg interp = src[2]; 2408 2409 if (brw->has_pln && 2410 delta_y.nr == delta_x.nr + 1 && 2411 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) { 2412 brw_PLN(p, dst, interp, delta_x); 2413 } else { 2414 brw_LINE(p, brw_null_reg(), interp, delta_x); 2415 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 2416 } 2417} 2418 2419void 2420fs_visitor::generate_math(fs_inst *inst, 2421 struct brw_reg dst, struct brw_reg *src) 2422{ 2423 int op; 2424 2425 switch (inst->opcode) { 2426 case FS_OPCODE_RCP: 2427 op = BRW_MATH_FUNCTION_INV; 2428 break; 2429 case FS_OPCODE_RSQ: 2430 op = BRW_MATH_FUNCTION_RSQ; 2431 break; 2432 case FS_OPCODE_SQRT: 2433 op = BRW_MATH_FUNCTION_SQRT; 2434 break; 2435 case FS_OPCODE_EXP2: 2436 op = BRW_MATH_FUNCTION_EXP; 2437 break; 2438 case FS_OPCODE_LOG2: 2439 op = BRW_MATH_FUNCTION_LOG; 2440 break; 2441 case FS_OPCODE_POW: 2442 op = BRW_MATH_FUNCTION_POW; 2443 break; 2444 case FS_OPCODE_SIN: 2445 op = BRW_MATH_FUNCTION_SIN; 2446 break; 2447 case FS_OPCODE_COS: 2448 op = BRW_MATH_FUNCTION_COS; 2449 break; 2450 default: 2451 assert(!"not reached: unknown math function"); 2452 op = 0; 2453 break; 2454 } 2455 2456 if (intel->gen >= 6) { 2457 assert(inst->mlen == 0); 2458 2459 if (inst->opcode == FS_OPCODE_POW) { 2460 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2461 brw_math2(p, dst, op, src[0], src[1]); 2462 2463 if (c->dispatch_width == 16) { 2464 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2465 brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1])); 2466 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2467 } 2468 } else { 2469 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2470 brw_math(p, dst, 2471 op, 2472 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2473 BRW_MATH_SATURATE_NONE, 2474 0, src[0], 2475 BRW_MATH_DATA_VECTOR, 2476 BRW_MATH_PRECISION_FULL); 2477 2478 if (c->dispatch_width == 16) { 2479 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2480 brw_math(p, sechalf(dst), 2481 op, 2482 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2483 BRW_MATH_SATURATE_NONE, 2484 0, sechalf(src[0]), 2485 BRW_MATH_DATA_VECTOR, 2486 BRW_MATH_PRECISION_FULL); 2487 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2488 } 2489 } 2490 } else /* gen <= 5 */{ 2491 assert(inst->mlen >= 1); 2492 2493 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2494 brw_math(p, dst, 2495 op, 2496 inst->saturate ? BRW_MATH_SATURATE_SATURATE : 2497 BRW_MATH_SATURATE_NONE, 2498 inst->base_mrf, src[0], 2499 BRW_MATH_DATA_VECTOR, 2500 BRW_MATH_PRECISION_FULL); 2501 2502 if (c->dispatch_width == 16) { 2503 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 2504 brw_math(p, sechalf(dst), 2505 op, 2506 inst->saturate ? 
BRW_MATH_SATURATE_SATURATE : 2507 BRW_MATH_SATURATE_NONE, 2508 inst->base_mrf + 1, sechalf(src[0]), 2509 BRW_MATH_DATA_VECTOR, 2510 BRW_MATH_PRECISION_FULL); 2511 2512 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 2513 } 2514 } 2515} 2516 2517void 2518fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2519{ 2520 int msg_type = -1; 2521 int rlen = 4; 2522 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 2523 2524 if (c->dispatch_width == 16) { 2525 rlen = 8; 2526 dst = vec16(dst); 2527 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2528 } 2529 2530 if (intel->gen >= 5) { 2531 switch (inst->opcode) { 2532 case FS_OPCODE_TEX: 2533 if (inst->shadow_compare) { 2534 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 2535 } else { 2536 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 2537 } 2538 break; 2539 case FS_OPCODE_TXB: 2540 if (inst->shadow_compare) { 2541 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 2542 } else { 2543 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 2544 } 2545 break; 2546 case FS_OPCODE_TXL: 2547 if (inst->shadow_compare) { 2548 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 2549 } else { 2550 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 2551 } 2552 break; 2553 case FS_OPCODE_TXD: 2554 assert(!"TXD isn't supported on gen5+ yet."); 2555 break; 2556 } 2557 } else { 2558 switch (inst->opcode) { 2559 case FS_OPCODE_TEX: 2560 /* Note that G45 and older determines shadow compare and dispatch width 2561 * from message length for most messages. 2562 */ 2563 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 2564 if (inst->shadow_compare) { 2565 assert(inst->mlen == 6); 2566 } else { 2567 assert(inst->mlen <= 4); 2568 } 2569 break; 2570 case FS_OPCODE_TXB: 2571 if (inst->shadow_compare) { 2572 assert(inst->mlen == 6); 2573 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 2574 } else { 2575 assert(inst->mlen == 9); 2576 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 2577 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2578 } 2579 break; 2580 case FS_OPCODE_TXL: 2581 if (inst->shadow_compare) { 2582 assert(inst->mlen == 6); 2583 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 2584 } else { 2585 assert(inst->mlen == 9); 2586 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 2587 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 2588 } 2589 break; 2590 case FS_OPCODE_TXD: 2591 assert(!"TXD isn't supported on gen4 yet."); 2592 break; 2593 } 2594 } 2595 assert(msg_type != -1); 2596 2597 brw_SAMPLE(p, 2598 retype(dst, BRW_REGISTER_TYPE_UW), 2599 inst->base_mrf, 2600 src, 2601 SURF_INDEX_TEXTURE(inst->sampler), 2602 inst->sampler, 2603 WRITEMASK_XYZW, 2604 msg_type, 2605 rlen, 2606 inst->mlen, 2607 0, 2608 1, 2609 simd_mode); 2610} 2611 2612 2613/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 2614 * looking like: 2615 * 2616 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 2617 * 2618 * and we're trying to produce: 2619 * 2620 * DDX DDY 2621 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 2622 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 2623 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 2624 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 2625 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 2626 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 2627 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 2628 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 2629 * 2630 * and add another set of two more subspans if in 16-pixel dispatch mode. 
2631 * 2632 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 2633 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 2634 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled 2635 * between each other. We could probably do it like ddx and swizzle the right 2636 * order later, but bail for now and just produce 2637 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) 2638 */ 2639void 2640fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2641{ 2642 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 2643 BRW_REGISTER_TYPE_F, 2644 BRW_VERTICAL_STRIDE_2, 2645 BRW_WIDTH_2, 2646 BRW_HORIZONTAL_STRIDE_0, 2647 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2648 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 2649 BRW_REGISTER_TYPE_F, 2650 BRW_VERTICAL_STRIDE_2, 2651 BRW_WIDTH_2, 2652 BRW_HORIZONTAL_STRIDE_0, 2653 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2654 brw_ADD(p, dst, src0, negate(src1)); 2655} 2656 2657void 2658fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src) 2659{ 2660 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 2661 BRW_REGISTER_TYPE_F, 2662 BRW_VERTICAL_STRIDE_4, 2663 BRW_WIDTH_4, 2664 BRW_HORIZONTAL_STRIDE_0, 2665 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2666 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 2667 BRW_REGISTER_TYPE_F, 2668 BRW_VERTICAL_STRIDE_4, 2669 BRW_WIDTH_4, 2670 BRW_HORIZONTAL_STRIDE_0, 2671 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 2672 brw_ADD(p, dst, src0, negate(src1)); 2673} 2674 2675void 2676fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask) 2677{ 2678 if (intel->gen >= 6) { 2679 /* Gen6 no longer has the mask reg for us to just read the 2680 * active channels from. However, cmp updates just the channels 2681 * of the flag reg that are enabled, so we can get at the 2682 * channel enables that way. In this step, make a reg of ones 2683 * we'll compare to. 
2684 */ 2685 brw_MOV(p, mask, brw_imm_ud(1)); 2686 } else { 2687 brw_push_insn_state(p); 2688 brw_set_mask_control(p, BRW_MASK_DISABLE); 2689 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2690 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */ 2691 brw_pop_insn_state(p); 2692 } 2693} 2694 2695void 2696fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) 2697{ 2698 if (intel->gen >= 6) { 2699 struct brw_reg f0 = brw_flag_reg(); 2700 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 2701 2702 brw_push_insn_state(p); 2703 brw_set_mask_control(p, BRW_MASK_DISABLE); 2704 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */ 2705 brw_pop_insn_state(p); 2706 2707 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), 2708 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */ 2709 /* Undo CMP's whacking of predication*/ 2710 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 2711 2712 brw_push_insn_state(p); 2713 brw_set_mask_control(p, BRW_MASK_DISABLE); 2714 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2715 brw_AND(p, g1, f0, g1); 2716 brw_pop_insn_state(p); 2717 } else { 2718 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 2719 2720 mask = brw_uw1_reg(mask.file, mask.nr, 0); 2721 2722 brw_push_insn_state(p); 2723 brw_set_mask_control(p, BRW_MASK_DISABLE); 2724 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 2725 brw_AND(p, g0, mask, g0); 2726 brw_pop_insn_state(p); 2727 } 2728} 2729 2730void 2731fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) 2732{ 2733 assert(inst->mlen != 0); 2734 2735 brw_MOV(p, 2736 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), 2737 retype(src, BRW_REGISTER_TYPE_UD)); 2738 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1, 2739 inst->offset); 2740} 2741 2742void 2743fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) 2744{ 2745 assert(inst->mlen != 0); 2746 2747 /* Clear any post destination dependencies that would be ignored by 2748 * the block read. See the B-Spec for pre-gen5 send instruction. 2749 * 2750 * This could use a better solution, since texture sampling and 2751 * math reads could potentially run into it as well -- anywhere 2752 * that we have a SEND with a destination that is a register that 2753 * was written but not read within the last N instructions (what's 2754 * N? unsure). This is rare because of dead code elimination, but 2755 * not impossible. 2756 */ 2757 if (intel->gen == 4 && !intel->is_g4x) 2758 brw_MOV(p, brw_null_reg(), dst); 2759 2760 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, 2761 inst->offset); 2762 2763 if (intel->gen == 4 && !intel->is_g4x) { 2764 /* gen4 errata: destination from a send can't be used as a 2765 * destination until it's been read. Just read it so we don't 2766 * have to worry. 2767 */ 2768 brw_MOV(p, brw_null_reg(), dst); 2769 } 2770} 2771 2772 2773void 2774fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) 2775{ 2776 assert(inst->mlen != 0); 2777 2778 /* Clear any post destination dependencies that would be ignored by 2779 * the block read. See the B-Spec for pre-gen5 send instruction. 2780 * 2781 * This could use a better solution, since texture sampling and 2782 * math reads could potentially run into it as well -- anywhere 2783 * that we have a SEND with a destination that is a register that 2784 * was written but not read within the last N instructions (what's 2785 * N? unsure). 
This is rare because of dead code elimination, but 2786 * not impossible. 2787 */ 2788 if (intel->gen == 4 && !intel->is_g4x) 2789 brw_MOV(p, brw_null_reg(), dst); 2790 2791 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 2792 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER); 2793 2794 if (intel->gen == 4 && !intel->is_g4x) { 2795 /* gen4 errata: destination from a send can't be used as a 2796 * destination until it's been read. Just read it so we don't 2797 * have to worry. 2798 */ 2799 brw_MOV(p, brw_null_reg(), dst); 2800 } 2801} 2802 2803/** 2804 * To be called after the last _mesa_add_state_reference() call, to 2805 * set up prog_data.param[] for assign_curb_setup() and 2806 * setup_pull_constants(). 2807 */ 2808void 2809fs_visitor::setup_paramvalues_refs() 2810{ 2811 if (c->dispatch_width != 8) 2812 return; 2813 2814 /* Set up the pointers to ParamValues now that that array is finalized. */ 2815 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { 2816 c->prog_data.param[i] = 2817 fp->Base.Parameters->ParameterValues[this->param_index[i]] + 2818 this->param_offset[i]; 2819 } 2820} 2821 2822void 2823fs_visitor::assign_curb_setup() 2824{ 2825 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8; 2826 if (c->dispatch_width == 8) { 2827 c->prog_data.first_curbe_grf = c->nr_payload_regs; 2828 } else { 2829 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs; 2830 } 2831 2832 /* Map the offsets in the UNIFORM file to fixed HW regs. */ 2833 foreach_iter(exec_list_iterator, iter, this->instructions) { 2834 fs_inst *inst = (fs_inst *)iter.get(); 2835 2836 for (unsigned int i = 0; i < 3; i++) { 2837 if (inst->src[i].file == UNIFORM) { 2838 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 2839 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs + 2840 constant_nr / 8, 2841 constant_nr % 8); 2842 2843 inst->src[i].file = FIXED_HW_REG; 2844 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type); 2845 } 2846 } 2847 } 2848} 2849 2850void 2851fs_visitor::calculate_urb_setup() 2852{ 2853 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2854 urb_setup[i] = -1; 2855 } 2856 2857 int urb_next = 0; 2858 /* Figure out where each of the incoming setup attributes lands. */ 2859 if (intel->gen >= 6) { 2860 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { 2861 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { 2862 urb_setup[i] = urb_next++; 2863 } 2864 } 2865 } else { 2866 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ 2867 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) { 2868 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) { 2869 int fp_index; 2870 2871 if (i >= VERT_RESULT_VAR0) 2872 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0); 2873 else if (i <= VERT_RESULT_TEX7) 2874 fp_index = i; 2875 else 2876 fp_index = -1; 2877 2878 if (fp_index >= 0) 2879 urb_setup[fp_index] = urb_next++; 2880 } 2881 } 2882 } 2883 2884 /* Each attribute is 4 setup channels, each of which is half a reg. */ 2885 c->prog_data.urb_read_length = urb_next * 2; 2886} 2887 2888void 2889fs_visitor::assign_urb_setup() 2890{ 2891 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length; 2892 2893 /* Offset all the urb_setup[] index by the actual position of the 2894 * setup regs, now that the location of the constants has been chosen. 
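 * (calculate_urb_setup() assigned the slots starting from 0; the loop
 * below shifts the LINTERP/CINTERP fixed registers past the payload and
 * CURB registers.)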
2895 */ 2896 foreach_iter(exec_list_iterator, iter, this->instructions) { 2897 fs_inst *inst = (fs_inst *)iter.get(); 2898 2899 if (inst->opcode == FS_OPCODE_LINTERP) { 2900 assert(inst->src[2].file == FIXED_HW_REG); 2901 inst->src[2].fixed_hw_reg.nr += urb_start; 2902 } 2903 2904 if (inst->opcode == FS_OPCODE_CINTERP) { 2905 assert(inst->src[0].file == FIXED_HW_REG); 2906 inst->src[0].fixed_hw_reg.nr += urb_start; 2907 } 2908 } 2909 2910 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; 2911} 2912 2913/** 2914 * Split large virtual GRFs into separate components if we can. 2915 * 2916 * This is mostly duplicated with what brw_fs_vector_splitting does, 2917 * but that's really conservative because it's afraid of doing 2918 * splitting that doesn't result in real progress after the rest of 2919 * the optimization phases, which would cause infinite looping in 2920 * optimization. We can do it once here, safely. This also has the 2921 * opportunity to split interpolated values, or maybe even uniforms, 2922 * which we don't have at the IR level. 2923 * 2924 * We want to split, because virtual GRFs are what we register 2925 * allocate and spill (due to contiguousness requirements for some 2926 * instructions), and they're what we naturally generate in the 2927 * codegen process, but most virtual GRFs don't actually need to be 2928 * contiguous sets of GRFs. If we split, we'll end up with reduced 2929 * live intervals and better dead code elimination and coalescing. 2930 */ 2931void 2932fs_visitor::split_virtual_grfs() 2933{ 2934 int num_vars = this->virtual_grf_next; 2935 bool split_grf[num_vars]; 2936 int new_virtual_grf[num_vars]; 2937 2938 /* Try to split anything > 0 sized. */ 2939 for (int i = 0; i < num_vars; i++) { 2940 if (this->virtual_grf_sizes[i] != 1) 2941 split_grf[i] = true; 2942 else 2943 split_grf[i] = false; 2944 } 2945 2946 if (brw->has_pln) { 2947 /* PLN opcodes rely on the delta_xy being contiguous. */ 2948 split_grf[this->delta_x.reg] = false; 2949 } 2950 2951 foreach_iter(exec_list_iterator, iter, this->instructions) { 2952 fs_inst *inst = (fs_inst *)iter.get(); 2953 2954 /* Texturing produces 4 contiguous registers, so no splitting. */ 2955 if (inst->is_tex()) { 2956 split_grf[inst->dst.reg] = false; 2957 } 2958 } 2959 2960 /* Allocate new space for split regs. Note that the virtual 2961 * numbers will be contiguous. 2962 */ 2963 for (int i = 0; i < num_vars; i++) { 2964 if (split_grf[i]) { 2965 new_virtual_grf[i] = virtual_grf_alloc(1); 2966 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { 2967 int reg = virtual_grf_alloc(1); 2968 assert(reg == new_virtual_grf[i] + j - 1); 2969 (void) reg; 2970 } 2971 this->virtual_grf_sizes[i] = 1; 2972 } 2973 } 2974 2975 foreach_iter(exec_list_iterator, iter, this->instructions) { 2976 fs_inst *inst = (fs_inst *)iter.get(); 2977 2978 if (inst->dst.file == GRF && 2979 split_grf[inst->dst.reg] && 2980 inst->dst.reg_offset != 0) { 2981 inst->dst.reg = (new_virtual_grf[inst->dst.reg] + 2982 inst->dst.reg_offset - 1); 2983 inst->dst.reg_offset = 0; 2984 } 2985 for (int i = 0; i < 3; i++) { 2986 if (inst->src[i].file == GRF && 2987 split_grf[inst->src[i].reg] && 2988 inst->src[i].reg_offset != 0) { 2989 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + 2990 inst->src[i].reg_offset - 1); 2991 inst->src[i].reg_offset = 0; 2992 } 2993 } 2994 } 2995 this->live_intervals_valid = false; 2996} 2997 2998/** 2999 * Choose accesses from the UNIFORM file to demote to using the pull 3000 * constant buffer. 
3001 * 3002 * We allow a fragment shader to have more than the specified minimum 3003 * maximum number of fragment shader uniform components (64). If 3004 * there are too many of these, they'd fill up all of register space. 3005 * So, this will push some of them out to the pull constant buffer and 3006 * update the program to load them. 3007 */ 3008void 3009fs_visitor::setup_pull_constants() 3010{ 3011 /* Only allow 16 registers (128 uniform components) as push constants. */ 3012 unsigned int max_uniform_components = 16 * 8; 3013 if (c->prog_data.nr_params <= max_uniform_components) 3014 return; 3015 3016 if (c->dispatch_width == 16) { 3017 fail("Pull constants not supported in 16-wide\n"); 3018 return; 3019 } 3020 3021 /* Just demote the end of the list. We could probably do better 3022 * here, demoting things that are rarely used in the program first. 3023 */ 3024 int pull_uniform_base = max_uniform_components; 3025 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base; 3026 3027 foreach_iter(exec_list_iterator, iter, this->instructions) { 3028 fs_inst *inst = (fs_inst *)iter.get(); 3029 3030 for (int i = 0; i < 3; i++) { 3031 if (inst->src[i].file != UNIFORM) 3032 continue; 3033 3034 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset; 3035 if (uniform_nr < pull_uniform_base) 3036 continue; 3037 3038 fs_reg dst = fs_reg(this, glsl_type::float_type); 3039 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD, 3040 dst); 3041 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15; 3042 pull->ir = inst->ir; 3043 pull->annotation = inst->annotation; 3044 pull->base_mrf = 14; 3045 pull->mlen = 1; 3046 3047 inst->insert_before(pull); 3048 3049 inst->src[i].file = GRF; 3050 inst->src[i].reg = dst.reg; 3051 inst->src[i].reg_offset = 0; 3052 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3; 3053 } 3054 } 3055 3056 for (int i = 0; i < pull_uniform_count; i++) { 3057 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i]; 3058 c->prog_data.pull_param_convert[i] = 3059 c->prog_data.param_convert[pull_uniform_base + i]; 3060 } 3061 c->prog_data.nr_params -= pull_uniform_count; 3062 c->prog_data.nr_pull_params = pull_uniform_count; 3063} 3064 3065void 3066fs_visitor::calculate_live_intervals() 3067{ 3068 int num_vars = this->virtual_grf_next; 3069 int *def = ralloc_array(mem_ctx, int, num_vars); 3070 int *use = ralloc_array(mem_ctx, int, num_vars); 3071 int loop_depth = 0; 3072 int loop_start = 0; 3073 int bb_header_ip = 0; 3074 3075 if (this->live_intervals_valid) 3076 return; 3077 3078 for (int i = 0; i < num_vars; i++) { 3079 def[i] = MAX_INSTRUCTION; 3080 use[i] = -1; 3081 } 3082 3083 int ip = 0; 3084 foreach_iter(exec_list_iterator, iter, this->instructions) { 3085 fs_inst *inst = (fs_inst *)iter.get(); 3086 3087 if (inst->opcode == BRW_OPCODE_DO) { 3088 if (loop_depth++ == 0) 3089 loop_start = ip; 3090 } else if (inst->opcode == BRW_OPCODE_WHILE) { 3091 loop_depth--; 3092 3093 if (loop_depth == 0) { 3094 /* Patches up the use of vars marked for being live across 3095 * the whole loop. 
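 * (Their use[] entry was pinned to loop_start while inside the loop;
 * now that the outermost WHILE has been reached, extend it to the
 * current ip so the register stays live across the whole loop body.)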
3096 */ 3097 for (int i = 0; i < num_vars; i++) { 3098 if (use[i] == loop_start) { 3099 use[i] = ip; 3100 } 3101 } 3102 } 3103 } else { 3104 for (unsigned int i = 0; i < 3; i++) { 3105 if (inst->src[i].file == GRF && inst->src[i].reg != 0) { 3106 int reg = inst->src[i].reg; 3107 3108 if (!loop_depth) { 3109 use[reg] = ip; 3110 } else { 3111 def[reg] = MIN2(loop_start, def[reg]); 3112 use[reg] = loop_start; 3113 3114 /* Nobody else is going to go smash our start to 3115 * later in the loop now, because def[reg] now 3116 * points before the bb header. 3117 */ 3118 } 3119 } 3120 } 3121 if (inst->dst.file == GRF && inst->dst.reg != 0) { 3122 int reg = inst->dst.reg; 3123 3124 if (!loop_depth) { 3125 def[reg] = MIN2(def[reg], ip); 3126 } else { 3127 def[reg] = MIN2(def[reg], loop_start); 3128 } 3129 } 3130 } 3131 3132 ip++; 3133 3134 /* Set the basic block header IP. This is used for determining 3135 * if a complete def of single-register virtual GRF in a loop 3136 * dominates a use in the same basic block. It's a quick way to 3137 * reduce the live interval range of most register used in a 3138 * loop. 3139 */ 3140 if (inst->opcode == BRW_OPCODE_IF || 3141 inst->opcode == BRW_OPCODE_ELSE || 3142 inst->opcode == BRW_OPCODE_ENDIF || 3143 inst->opcode == BRW_OPCODE_DO || 3144 inst->opcode == BRW_OPCODE_WHILE || 3145 inst->opcode == BRW_OPCODE_BREAK || 3146 inst->opcode == BRW_OPCODE_CONTINUE) { 3147 bb_header_ip = ip; 3148 } 3149 } 3150 3151 ralloc_free(this->virtual_grf_def); 3152 ralloc_free(this->virtual_grf_use); 3153 this->virtual_grf_def = def; 3154 this->virtual_grf_use = use; 3155 3156 this->live_intervals_valid = true; 3157} 3158 3159/** 3160 * Attempts to move immediate constants into the immediate 3161 * constant slot of following instructions. 3162 * 3163 * Immediate constants are a bit tricky -- they have to be in the last 3164 * operand slot, you can't do abs/negate on them, 3165 */ 3166 3167bool 3168fs_visitor::propagate_constants() 3169{ 3170 bool progress = false; 3171 3172 calculate_live_intervals(); 3173 3174 foreach_iter(exec_list_iterator, iter, this->instructions) { 3175 fs_inst *inst = (fs_inst *)iter.get(); 3176 3177 if (inst->opcode != BRW_OPCODE_MOV || 3178 inst->predicated || 3179 inst->dst.file != GRF || inst->src[0].file != IMM || 3180 inst->dst.type != inst->src[0].type || 3181 (c->dispatch_width == 16 && 3182 (inst->force_uncompressed || inst->force_sechalf))) 3183 continue; 3184 3185 /* Don't bother with cases where we should have had the 3186 * operation on the constant folded in GLSL already. 3187 */ 3188 if (inst->saturate) 3189 continue; 3190 3191 /* Found a move of a constant to a GRF. Find anything else using the GRF 3192 * before it's written, and replace it with the constant if we can. 3193 */ 3194 exec_list_iterator scan_iter = iter; 3195 scan_iter.next(); 3196 for (; scan_iter.has_next(); scan_iter.next()) { 3197 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3198 3199 if (scan_inst->opcode == BRW_OPCODE_DO || 3200 scan_inst->opcode == BRW_OPCODE_WHILE || 3201 scan_inst->opcode == BRW_OPCODE_ELSE || 3202 scan_inst->opcode == BRW_OPCODE_ENDIF) { 3203 break; 3204 } 3205 3206 for (int i = 2; i >= 0; i--) { 3207 if (scan_inst->src[i].file != GRF || 3208 scan_inst->src[i].reg != inst->dst.reg || 3209 scan_inst->src[i].reg_offset != inst->dst.reg_offset) 3210 continue; 3211 3212 /* Don't bother with cases where we should have had the 3213 * operation on the constant folded in GLSL already. 
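 * (An immediate operand can't carry an abs or negate modifier, and
 * folding the modifier into the constant is GLSL's job, so skip these.)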
3214 */ 3215 if (scan_inst->src[i].negate || scan_inst->src[i].abs) 3216 continue; 3217 3218 switch (scan_inst->opcode) { 3219 case BRW_OPCODE_MOV: 3220 scan_inst->src[i] = inst->src[0]; 3221 progress = true; 3222 break; 3223 3224 case BRW_OPCODE_MUL: 3225 case BRW_OPCODE_ADD: 3226 if (i == 1) { 3227 scan_inst->src[i] = inst->src[0]; 3228 progress = true; 3229 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3230 /* Fit this constant in by commuting the operands */ 3231 scan_inst->src[0] = scan_inst->src[1]; 3232 scan_inst->src[1] = inst->src[0]; 3233 progress = true; 3234 } 3235 break; 3236 3237 case BRW_OPCODE_CMP: 3238 if (i == 1) { 3239 scan_inst->src[i] = inst->src[0]; 3240 progress = true; 3241 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3242 uint32_t new_cmod; 3243 3244 new_cmod = brw_swap_cmod(scan_inst->conditional_mod); 3245 if (new_cmod != ~0u) { 3246 /* Fit this constant in by swapping the operands and 3247 * flipping the test 3248 */ 3249 scan_inst->src[0] = scan_inst->src[1]; 3250 scan_inst->src[1] = inst->src[0]; 3251 scan_inst->conditional_mod = new_cmod; 3252 progress = true; 3253 } 3254 } 3255 break; 3256 3257 case BRW_OPCODE_SEL: 3258 if (i == 1) { 3259 scan_inst->src[i] = inst->src[0]; 3260 progress = true; 3261 } else if (i == 0 && scan_inst->src[1].file != IMM) { 3262 /* Fit this constant in by swapping the operands and 3263 * flipping the predicate 3264 */ 3265 scan_inst->src[0] = scan_inst->src[1]; 3266 scan_inst->src[1] = inst->src[0]; 3267 scan_inst->predicate_inverse = !scan_inst->predicate_inverse; 3268 progress = true; 3269 } 3270 break; 3271 } 3272 } 3273 3274 if (scan_inst->dst.file == GRF && 3275 scan_inst->dst.reg == inst->dst.reg && 3276 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3277 scan_inst->is_tex())) { 3278 break; 3279 } 3280 } 3281 } 3282 3283 if (progress) 3284 this->live_intervals_valid = false; 3285 3286 return progress; 3287} 3288/** 3289 * Must be called after calculate_live_intervales() to remove unused 3290 * writes to registers -- register allocation will fail otherwise 3291 * because something deffed but not used won't be considered to 3292 * interfere with other regs. 3293 */ 3294bool 3295fs_visitor::dead_code_eliminate() 3296{ 3297 bool progress = false; 3298 int pc = 0; 3299 3300 calculate_live_intervals(); 3301 3302 foreach_iter(exec_list_iterator, iter, this->instructions) { 3303 fs_inst *inst = (fs_inst *)iter.get(); 3304 3305 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) { 3306 inst->remove(); 3307 progress = true; 3308 } 3309 3310 pc++; 3311 } 3312 3313 if (progress) 3314 live_intervals_valid = false; 3315 3316 return progress; 3317} 3318 3319bool 3320fs_visitor::register_coalesce() 3321{ 3322 bool progress = false; 3323 int if_depth = 0; 3324 int loop_depth = 0; 3325 3326 foreach_iter(exec_list_iterator, iter, this->instructions) { 3327 fs_inst *inst = (fs_inst *)iter.get(); 3328 3329 /* Make sure that we dominate the instructions we're going to 3330 * scan for interfering with our coalescing, or we won't have 3331 * scanned enough to see if anything interferes with our 3332 * coalescing. We don't dominate the following instructions if 3333 * we're in a loop or an if block. 
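 * (So the pass simply skips every instruction while loop_depth or
 * if_depth is non-zero, as the checks just below do.)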
3334 */ 3335 switch (inst->opcode) { 3336 case BRW_OPCODE_DO: 3337 loop_depth++; 3338 break; 3339 case BRW_OPCODE_WHILE: 3340 loop_depth--; 3341 break; 3342 case BRW_OPCODE_IF: 3343 if_depth++; 3344 break; 3345 case BRW_OPCODE_ENDIF: 3346 if_depth--; 3347 break; 3348 } 3349 if (loop_depth || if_depth) 3350 continue; 3351 3352 if (inst->opcode != BRW_OPCODE_MOV || 3353 inst->predicated || 3354 inst->saturate || 3355 inst->dst.file != GRF || inst->src[0].file != GRF || 3356 inst->dst.type != inst->src[0].type) 3357 continue; 3358 3359 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; 3360 3361 /* Found a move of a GRF to a GRF. Let's see if we can coalesce 3362 * them: check for no writes to either one until the exit of the 3363 * program. 3364 */ 3365 bool interfered = false; 3366 exec_list_iterator scan_iter = iter; 3367 scan_iter.next(); 3368 for (; scan_iter.has_next(); scan_iter.next()) { 3369 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3370 3371 if (scan_inst->dst.file == GRF) { 3372 if (scan_inst->dst.reg == inst->dst.reg && 3373 (scan_inst->dst.reg_offset == inst->dst.reg_offset || 3374 scan_inst->is_tex())) { 3375 interfered = true; 3376 break; 3377 } 3378 if (scan_inst->dst.reg == inst->src[0].reg && 3379 (scan_inst->dst.reg_offset == inst->src[0].reg_offset || 3380 scan_inst->is_tex())) { 3381 interfered = true; 3382 break; 3383 } 3384 } 3385 3386 /* The gen6 MATH instruction can't handle source modifiers, so avoid 3387 * coalescing those for now. We should do something more specific. 3388 */ 3389 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) { 3390 interfered = true; 3391 break; 3392 } 3393 } 3394 if (interfered) { 3395 continue; 3396 } 3397 3398 /* Rewrite the later usage to point at the source of the move to 3399 * be removed. 3400 */ 3401 for (exec_list_iterator scan_iter = iter; scan_iter.has_next(); 3402 scan_iter.next()) { 3403 fs_inst *scan_inst = (fs_inst *)scan_iter.get(); 3404 3405 for (int i = 0; i < 3; i++) { 3406 if (scan_inst->src[i].file == GRF && 3407 scan_inst->src[i].reg == inst->dst.reg && 3408 scan_inst->src[i].reg_offset == inst->dst.reg_offset) { 3409 scan_inst->src[i].reg = inst->src[0].reg; 3410 scan_inst->src[i].reg_offset = inst->src[0].reg_offset; 3411 scan_inst->src[i].abs |= inst->src[0].abs; 3412 scan_inst->src[i].negate ^= inst->src[0].negate; 3413 scan_inst->src[i].smear = inst->src[0].smear; 3414 } 3415 } 3416 } 3417 3418 inst->remove(); 3419 progress = true; 3420 } 3421 3422 if (progress) 3423 live_intervals_valid = false; 3424 3425 return progress; 3426} 3427 3428 3429bool 3430fs_visitor::compute_to_mrf() 3431{ 3432 bool progress = false; 3433 int next_ip = 0; 3434 3435 calculate_live_intervals(); 3436 3437 foreach_iter(exec_list_iterator, iter, this->instructions) { 3438 fs_inst *inst = (fs_inst *)iter.get(); 3439 3440 int ip = next_ip; 3441 next_ip++; 3442 3443 if (inst->opcode != BRW_OPCODE_MOV || 3444 inst->predicated || 3445 inst->dst.file != MRF || inst->src[0].file != GRF || 3446 inst->dst.type != inst->src[0].type || 3447 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1) 3448 continue; 3449 3450 /* Work out which hardware MRF registers are written by this 3451 * instruction. 
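 * (A COMPR4 destination covers hw_reg and hw_reg + 4, a plain
 * compressed 16-wide write covers hw_reg and hw_reg + 1, and anything
 * else covers only hw_reg, which is what mrf_low/mrf_high track below.)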
3452 */
3453 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
3454 int mrf_high;
3455 if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
3456 mrf_high = mrf_low + 4;
3457 } else if (c->dispatch_width == 16 &&
3458 (!inst->force_uncompressed && !inst->force_sechalf)) {
3459 mrf_high = mrf_low + 1;
3460 } else {
3461 mrf_high = mrf_low;
3462 }
3463
3464 /* Can't compute-to-MRF this GRF if someone else was going to
3465 * read it later.
3466 */
3467 if (this->virtual_grf_use[inst->src[0].reg] > ip)
3468 continue;
3469
3470 /* Found a move of a GRF to a MRF. Let's see if we can go
3471 * rewrite the thing that made this GRF to write into the MRF.
3472 */
3473 fs_inst *scan_inst;
3474 for (scan_inst = (fs_inst *)inst->prev;
3475 scan_inst->prev != NULL;
3476 scan_inst = (fs_inst *)scan_inst->prev) {
3477 if (scan_inst->dst.file == GRF &&
3478 scan_inst->dst.reg == inst->src[0].reg) {
3479 /* Found the last thing to write our reg we want to turn
3480 * into a compute-to-MRF.
3481 */
3482
3483 if (scan_inst->is_tex()) {
3484 /* texturing writes several contiguous regs, so we can't
3485 * compute-to-mrf that.
3486 */
3487 break;
3488 }
3489
3490 /* If it's predicated, it (probably) didn't populate all
3491 * the channels. We might be able to rewrite everything
3492 * that writes that reg, but it would require smarter
3493 * tracking to delay the rewriting until complete success.
3494 */
3495 if (scan_inst->predicated)
3496 break;
3497
3498 /* If it's half of register setup and not the same half as
3499 * our MOV we're trying to remove, bail for now.
3500 */
3501 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
3502 scan_inst->force_sechalf != inst->force_sechalf) {
3503 break;
3504 }
3505
3506 /* SEND instructions can't have MRF as a destination. */
3507 if (scan_inst->mlen)
3508 break;
3509
3510 if (intel->gen >= 6) {
3511 /* gen6 math instructions must have the destination be
3512 * GRF, so no compute-to-MRF for them.
3513 */
3514 if (scan_inst->is_math()) {
3515 break;
3516 }
3517 }
3518
3519 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3520 /* Found the creator of our MRF's source value. */
3521 scan_inst->dst.file = MRF;
3522 scan_inst->dst.hw_reg = inst->dst.hw_reg;
3523 scan_inst->saturate |= inst->saturate;
3524 inst->remove();
3525 progress = true;
3526 }
3527 break;
3528 }
3529
3530 /* We don't handle flow control here. Most computation of
3531 * values that end up in MRFs happens shortly before the MRF
3532 * write anyway.
3533 */
3534 if (scan_inst->opcode == BRW_OPCODE_DO ||
3535 scan_inst->opcode == BRW_OPCODE_WHILE ||
3536 scan_inst->opcode == BRW_OPCODE_ELSE ||
3537 scan_inst->opcode == BRW_OPCODE_ENDIF) {
3538 break;
3539 }
3540
3541 /* You can't read from an MRF, so if someone else reads our
3542 * MRF's source GRF that we wanted to rewrite, that stops us.
3543 */
3544 bool interfered = false;
3545 for (int i = 0; i < 3; i++) {
3546 if (scan_inst->src[i].file == GRF &&
3547 scan_inst->src[i].reg == inst->src[0].reg &&
3548 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3549 interfered = true;
3550 }
3551 }
3552 if (interfered)
3553 break;
3554
3555 if (scan_inst->dst.file == MRF) {
3556 /* If somebody else writes our MRF here, we can't
3557 * compute-to-MRF before that.
3558 */
3559 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
3560 int scan_mrf_high;
3561
3562 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
3563 scan_mrf_high = scan_mrf_low + 4;
3564 } else if (c->dispatch_width == 16 &&
3565 (!scan_inst->force_uncompressed &&
3566 !scan_inst->force_sechalf)) {
3567 scan_mrf_high = scan_mrf_low + 1;
3568 } else {
3569 scan_mrf_high = scan_mrf_low;
3570 }
3571
3572 if (mrf_low == scan_mrf_low ||
3573 mrf_low == scan_mrf_high ||
3574 mrf_high == scan_mrf_low ||
3575 mrf_high == scan_mrf_high) {
3576 break;
3577 }
3578 }
3579
3580 if (scan_inst->mlen > 0) {
3581 /* Found a SEND instruction, which means that there are
3582 * live values in MRFs from base_mrf to base_mrf +
3583 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3584 * above it.
3585 */
3586 if (mrf_low >= scan_inst->base_mrf &&
3587 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
3588 break;
3589 }
3590 if (mrf_high >= scan_inst->base_mrf &&
3591 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
3592 break;
3593 }
3594 }
3595 }
3596 }
3597
3598 return progress;
3599}
3600
3601/**
3602 * Walks through basic blocks, looking for repeated MRF writes and
3603 * removing the later ones.
3604 */
3605bool
3606fs_visitor::remove_duplicate_mrf_writes()
3607{
3608 fs_inst *last_mrf_move[16];
3609 bool progress = false;
3610
3611 /* Need to update the MRF tracking for compressed instructions. */
3612 if (c->dispatch_width == 16)
3613 return false;
3614
3615 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3616
3617 foreach_iter(exec_list_iterator, iter, this->instructions) {
3618 fs_inst *inst = (fs_inst *)iter.get();
3619
3620 switch (inst->opcode) {
3621 case BRW_OPCODE_DO:
3622 case BRW_OPCODE_WHILE:
3623 case BRW_OPCODE_IF:
3624 case BRW_OPCODE_ELSE:
3625 case BRW_OPCODE_ENDIF:
3626 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3627 continue;
3628 default:
3629 break;
3630 }
3631
3632 if (inst->opcode == BRW_OPCODE_MOV &&
3633 inst->dst.file == MRF) {
3634 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3635 if (prev_inst && inst->equals(prev_inst)) {
3636 inst->remove();
3637 progress = true;
3638 continue;
3639 }
3640 }
3641
3642 /* Clear out the last-write records for MRFs that were overwritten. */
3643 if (inst->dst.file == MRF) {
3644 last_mrf_move[inst->dst.hw_reg] = NULL;
3645 }
3646
3647 if (inst->mlen > 0) {
3648 /* Found a SEND instruction, which will include two or fewer
3649 * implied MRF writes. We could do better here.
3650 */
3651 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3652 last_mrf_move[inst->base_mrf + i] = NULL;
3653 }
3654 }
3655
3656 /* Clear out any MRF move records whose sources got overwritten.
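 * (A later, identical-looking MOV from that GRF would no longer produce
 * the same value, so it must not be treated as a duplicate.)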
*/ 3657 if (inst->dst.file == GRF) { 3658 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) { 3659 if (last_mrf_move[i] && 3660 last_mrf_move[i]->src[0].reg == inst->dst.reg) { 3661 last_mrf_move[i] = NULL; 3662 } 3663 } 3664 } 3665 3666 if (inst->opcode == BRW_OPCODE_MOV && 3667 inst->dst.file == MRF && 3668 inst->src[0].file == GRF && 3669 !inst->predicated) { 3670 last_mrf_move[inst->dst.hw_reg] = inst; 3671 } 3672 } 3673 3674 return progress; 3675} 3676 3677bool 3678fs_visitor::virtual_grf_interferes(int a, int b) 3679{ 3680 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); 3681 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); 3682 3683 /* We can't handle dead register writes here, without iterating 3684 * over the whole instruction stream to find every single dead 3685 * write to that register to compare to the live interval of the 3686 * other register. Just assert that dead_code_eliminate() has been 3687 * called. 3688 */ 3689 assert((this->virtual_grf_use[a] != -1 || 3690 this->virtual_grf_def[a] == MAX_INSTRUCTION) && 3691 (this->virtual_grf_use[b] != -1 || 3692 this->virtual_grf_def[b] == MAX_INSTRUCTION)); 3693 3694 /* If the register is used to store 16 values of less than float 3695 * size (only the case for pixel_[xy]), then we can't allocate 3696 * another dword-sized thing to that register that would be used in 3697 * the same instruction. This is because when the GPU decodes (for 3698 * example): 3699 * 3700 * (declare (in ) vec4 gl_FragCoord@0x97766a0) 3701 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr }; 3702 * 3703 * it's actually processed as: 3704 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 }; 3705 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf }; 3706 * 3707 * so our second half values in g6 got overwritten in the first 3708 * half. 3709 */ 3710 if (c->dispatch_width == 16 && (this->pixel_x.reg == a || 3711 this->pixel_x.reg == b || 3712 this->pixel_y.reg == a || 3713 this->pixel_y.reg == b)) { 3714 return start <= end; 3715 } 3716 3717 return start < end; 3718} 3719 3720static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) 3721{ 3722 struct brw_reg brw_reg; 3723 3724 switch (reg->file) { 3725 case GRF: 3726 case ARF: 3727 case MRF: 3728 if (reg->smear == -1) { 3729 brw_reg = brw_vec8_reg(reg->file, 3730 reg->hw_reg, 0); 3731 } else { 3732 brw_reg = brw_vec1_reg(reg->file, 3733 reg->hw_reg, reg->smear); 3734 } 3735 brw_reg = retype(brw_reg, reg->type); 3736 if (reg->sechalf) 3737 brw_reg = sechalf(brw_reg); 3738 break; 3739 case IMM: 3740 switch (reg->type) { 3741 case BRW_REGISTER_TYPE_F: 3742 brw_reg = brw_imm_f(reg->imm.f); 3743 break; 3744 case BRW_REGISTER_TYPE_D: 3745 brw_reg = brw_imm_d(reg->imm.i); 3746 break; 3747 case BRW_REGISTER_TYPE_UD: 3748 brw_reg = brw_imm_ud(reg->imm.u); 3749 break; 3750 default: 3751 assert(!"not reached"); 3752 brw_reg = brw_null_reg(); 3753 break; 3754 } 3755 break; 3756 case FIXED_HW_REG: 3757 brw_reg = reg->fixed_hw_reg; 3758 break; 3759 case BAD_FILE: 3760 /* Probably unused. 
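 * (e.g. a src[] slot that was never assigned; generate_code() converts
 * all three source slots whether or not the instruction uses them.)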
*/ 3761 brw_reg = brw_null_reg(); 3762 break; 3763 case UNIFORM: 3764 assert(!"not reached"); 3765 brw_reg = brw_null_reg(); 3766 break; 3767 default: 3768 assert(!"not reached"); 3769 brw_reg = brw_null_reg(); 3770 break; 3771 } 3772 if (reg->abs) 3773 brw_reg = brw_abs(brw_reg); 3774 if (reg->negate) 3775 brw_reg = negate(brw_reg); 3776 3777 return brw_reg; 3778} 3779 3780void 3781fs_visitor::generate_code() 3782{ 3783 int last_native_inst = p->nr_insn; 3784 const char *last_annotation_string = NULL; 3785 ir_instruction *last_annotation_ir = NULL; 3786 3787 int loop_stack_array_size = 16; 3788 int loop_stack_depth = 0; 3789 brw_instruction **loop_stack = 3790 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); 3791 int *if_depth_in_loop = 3792 rzalloc_array(this->mem_ctx, int, loop_stack_array_size); 3793 3794 3795 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3796 printf("Native code for fragment shader %d (%d-wide dispatch):\n", 3797 ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width); 3798 } 3799 3800 foreach_iter(exec_list_iterator, iter, this->instructions) { 3801 fs_inst *inst = (fs_inst *)iter.get(); 3802 struct brw_reg src[3], dst; 3803 3804 if (unlikely(INTEL_DEBUG & DEBUG_WM)) { 3805 if (last_annotation_ir != inst->ir) { 3806 last_annotation_ir = inst->ir; 3807 if (last_annotation_ir) { 3808 printf(" "); 3809 last_annotation_ir->print(); 3810 printf("\n"); 3811 } 3812 } 3813 if (last_annotation_string != inst->annotation) { 3814 last_annotation_string = inst->annotation; 3815 if (last_annotation_string) 3816 printf(" %s\n", last_annotation_string); 3817 } 3818 } 3819 3820 for (unsigned int i = 0; i < 3; i++) { 3821 src[i] = brw_reg_from_fs_reg(&inst->src[i]); 3822 } 3823 dst = brw_reg_from_fs_reg(&inst->dst); 3824 3825 brw_set_conditionalmod(p, inst->conditional_mod); 3826 brw_set_predicate_control(p, inst->predicated); 3827 brw_set_predicate_inverse(p, inst->predicate_inverse); 3828 brw_set_saturate(p, inst->saturate); 3829 3830 if (inst->force_uncompressed || c->dispatch_width == 8) { 3831 brw_set_compression_control(p, BRW_COMPRESSION_NONE); 3832 } else if (inst->force_sechalf) { 3833 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); 3834 } else { 3835 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); 3836 } 3837 3838 switch (inst->opcode) { 3839 case BRW_OPCODE_MOV: 3840 brw_MOV(p, dst, src[0]); 3841 break; 3842 case BRW_OPCODE_ADD: 3843 brw_ADD(p, dst, src[0], src[1]); 3844 break; 3845 case BRW_OPCODE_MUL: 3846 brw_MUL(p, dst, src[0], src[1]); 3847 break; 3848 3849 case BRW_OPCODE_FRC: 3850 brw_FRC(p, dst, src[0]); 3851 break; 3852 case BRW_OPCODE_RNDD: 3853 brw_RNDD(p, dst, src[0]); 3854 break; 3855 case BRW_OPCODE_RNDE: 3856 brw_RNDE(p, dst, src[0]); 3857 break; 3858 case BRW_OPCODE_RNDZ: 3859 brw_RNDZ(p, dst, src[0]); 3860 break; 3861 3862 case BRW_OPCODE_AND: 3863 brw_AND(p, dst, src[0], src[1]); 3864 break; 3865 case BRW_OPCODE_OR: 3866 brw_OR(p, dst, src[0], src[1]); 3867 break; 3868 case BRW_OPCODE_XOR: 3869 brw_XOR(p, dst, src[0], src[1]); 3870 break; 3871 case BRW_OPCODE_NOT: 3872 brw_NOT(p, dst, src[0]); 3873 break; 3874 case BRW_OPCODE_ASR: 3875 brw_ASR(p, dst, src[0], src[1]); 3876 break; 3877 case BRW_OPCODE_SHR: 3878 brw_SHR(p, dst, src[0], src[1]); 3879 break; 3880 case BRW_OPCODE_SHL: 3881 brw_SHL(p, dst, src[0], src[1]); 3882 break; 3883 3884 case BRW_OPCODE_CMP: 3885 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 3886 break; 3887 case BRW_OPCODE_SEL: 3888 brw_SEL(p, dst, src[0], src[1]); 3889 break; 3890 
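/* Control-flow opcodes: besides emitting the EU instruction, the cases
 * below maintain loop_stack[] and if_depth_in_loop[] so that the
 * pre-gen6 BREAK/CONTINUE jump counts can be patched once the matching
 * WHILE is reached.
 */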
3891 case BRW_OPCODE_IF: 3892 if (inst->src[0].file != BAD_FILE) { 3893 assert(intel->gen >= 6); 3894 gen6_IF(p, inst->conditional_mod, src[0], src[1]); 3895 } else { 3896 brw_IF(p, BRW_EXECUTE_8); 3897 } 3898 if_depth_in_loop[loop_stack_depth]++; 3899 break; 3900 3901 case BRW_OPCODE_ELSE: 3902 brw_ELSE(p); 3903 break; 3904 case BRW_OPCODE_ENDIF: 3905 brw_ENDIF(p); 3906 if_depth_in_loop[loop_stack_depth]--; 3907 break; 3908 3909 case BRW_OPCODE_DO: 3910 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); 3911 if (loop_stack_array_size <= loop_stack_depth) { 3912 loop_stack_array_size *= 2; 3913 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, 3914 loop_stack_array_size); 3915 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, 3916 loop_stack_array_size); 3917 } 3918 if_depth_in_loop[loop_stack_depth] = 0; 3919 break; 3920 3921 case BRW_OPCODE_BREAK: 3922 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]); 3923 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3924 break; 3925 case BRW_OPCODE_CONTINUE: 3926 /* FINISHME: We need to write the loop instruction support still. */ 3927 if (intel->gen >= 6) 3928 gen6_CONT(p, loop_stack[loop_stack_depth - 1]); 3929 else 3930 brw_CONT(p, if_depth_in_loop[loop_stack_depth]); 3931 brw_set_predicate_control(p, BRW_PREDICATE_NONE); 3932 break; 3933 3934 case BRW_OPCODE_WHILE: { 3935 struct brw_instruction *inst0, *inst1; 3936 GLuint br = 1; 3937 3938 if (intel->gen >= 5) 3939 br = 2; 3940 3941 assert(loop_stack_depth > 0); 3942 loop_stack_depth--; 3943 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]); 3944 if (intel->gen < 6) { 3945 /* patch all the BREAK/CONT instructions from last BGNLOOP */ 3946 while (inst0 > loop_stack[loop_stack_depth]) { 3947 inst0--; 3948 if (inst0->header.opcode == BRW_OPCODE_BREAK && 3949 inst0->bits3.if_else.jump_count == 0) { 3950 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); 3951 } 3952 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && 3953 inst0->bits3.if_else.jump_count == 0) { 3954 inst0->bits3.if_else.jump_count = br * (inst1 - inst0); 3955 } 3956 } 3957 } 3958 } 3959 break; 3960 3961 case FS_OPCODE_RCP: 3962 case FS_OPCODE_RSQ: 3963 case FS_OPCODE_SQRT: 3964 case FS_OPCODE_EXP2: 3965 case FS_OPCODE_LOG2: 3966 case FS_OPCODE_POW: 3967 case FS_OPCODE_SIN: 3968 case FS_OPCODE_COS: 3969 generate_math(inst, dst, src); 3970 break; 3971 case FS_OPCODE_PIXEL_X: 3972 generate_pixel_xy(dst, true); 3973 break; 3974 case FS_OPCODE_PIXEL_Y: 3975 generate_pixel_xy(dst, false); 3976 break; 3977 case FS_OPCODE_CINTERP: 3978 brw_MOV(p, dst, src[0]); 3979 break; 3980 case FS_OPCODE_LINTERP: 3981 generate_linterp(inst, dst, src); 3982 break; 3983 case FS_OPCODE_TEX: 3984 case FS_OPCODE_TXB: 3985 case FS_OPCODE_TXD: 3986 case FS_OPCODE_TXL: 3987 generate_tex(inst, dst, src[0]); 3988 break; 3989 case FS_OPCODE_DISCARD_NOT: 3990 generate_discard_not(inst, dst); 3991 break; 3992 case FS_OPCODE_DISCARD_AND: 3993 generate_discard_and(inst, src[0]); 3994 break; 3995 case FS_OPCODE_DDX: 3996 generate_ddx(inst, dst, src[0]); 3997 break; 3998 case FS_OPCODE_DDY: 3999 generate_ddy(inst, dst, src[0]); 4000 break; 4001 4002 case FS_OPCODE_SPILL: 4003 generate_spill(inst, src[0]); 4004 break; 4005 4006 case FS_OPCODE_UNSPILL: 4007 generate_unspill(inst, dst); 4008 break; 4009 4010 case FS_OPCODE_PULL_CONSTANT_LOAD: 4011 generate_pull_constant_load(inst, dst); 4012 break; 4013 4014 case FS_OPCODE_FB_WRITE: 4015 generate_fb_write(inst); 4016 break; 4017 default: 4018 if (inst->opcode 
      default:
         if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          brw_opcodes[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
         fail("unsupported opcode in FS\n");
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
            if (0) {
               printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                      ((uint32_t *)&p->store[i])[3],
                      ((uint32_t *)&p->store[i])[2],
                      ((uint32_t *)&p->store[i])[1],
                      ((uint32_t *)&p->store[i])[0]);
            }
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }

      last_native_inst = p->nr_insn;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   ralloc_free(loop_stack);
   ralloc_free(if_depth_in_loop);

   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug.  So this is here in
    * case you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = 0; i < p->nr_insn; i++) {
            printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                   ((uint32_t *)&p->store[i])[3],
                   ((uint32_t *)&p->store[i])[2],
                   ((uint32_t *)&p->store[i])[1],
                   ((uint32_t *)&p->store[i])[0]);
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }
   }
}

bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      /* Align to a 64-byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         base_ir = ir;
         ir->accept(this);
      }

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      bool progress;
      do {
         progress = false;

         progress = remove_duplicate_mrf_writes() || progress;

         progress = propagate_constants() || progress;
         progress = register_coalesce() || progress;
         progress = compute_to_mrf() || progress;
         progress = dead_code_eliminate() || progress;
      } while (progress);

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

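      /* Register allocation.  The if (0) block below can be enabled to
       * force every virtual GRF through the spill path (handy for
       * exercising FS_OPCODE_SPILL/UNSPILL), and the loop that follows
       * keeps calling assign_regs() until allocation succeeds or the
       * compile has been marked as failed.
       */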
      if (0) {
         /* Debug of register spilling: Go spill everything. */
         int virtual_grf_count = virtual_grf_next;
         for (int i = 1; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   if (c->dispatch_width == 8) {
      c->prog_data.total_grf = grf_used;
   } else {
      c->prog_data.total_grf_16 = grf_used;
      c->prog_data.prog_offset_16 = prog_offset_16;

      /* Make sure we didn't try to sneak in an extra uniform. */
      assert(orig_nr_params == c->prog_data.nr_params);
   }

   return !failed;
}

bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;

   if (!prog)
      return false;

   struct brw_shader *shader =
      (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (!shader)
      return false;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("GLSL IR for native fragment shader %d:\n", prog->Name);
      _mesa_print_ir(shader->ir, NULL);
      printf("\n\n");
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   c->dispatch_width = 8;

   fs_visitor v(c, shader);
   if (!v.run()) {
      /* FINISHME: Cleanly fail, test at link time, etc. */
      assert(!"not reached");
      return false;
   }

   /* If the hardware and program allow it, also compile a 16-wide version,
    * reusing the uniform tracking from the 8-wide compile.
    */
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      c->dispatch_width = 16;
      fs_visitor v2(c, shader);
      v2.import_uniforms(v.variable_ht);
      v2.run();
   }

   c->prog_data.dispatch_width = 8;

   return true;
}