brw_fs_emit.cpp revision 454dc83f66643e66ea7ee9117368211f0cfe84d7
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_emit.cpp
 *
 * This file supports emitting code from the FS LIR to the actual
 * native instructions.
 */

extern "C" {
#include "main/macros.h"
#include "brw_context.h"
#include "brw_eu.h"
} /* extern "C" */

#include "brw_fs.h"
#include "brw_fs_cfg.h"
#include "glsl/ir_print_visitor.h"

/**
 * Emit the framebuffer (render target) write message.
 *
 * Builds the optional 2-register message header, picks the data port
 * message control for the dispatch mode (dual-source vs. 8- vs. 16-wide),
 * and emits the actual send via brw_fb_WRITE().  The header/MOV setup is
 * done with masking disabled and compression off so it executes for all
 * channels regardless of current execution state.
 */
void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   bool eot = inst->eot;
   struct brw_reg implied_header;
   uint32_t msg_control;

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (inst->header_present) {
      if (intel->gen >= 6) {
         /* On gen6+ the whole header (g0 copy) is built explicitly in the
          * MRF; the send then uses a null implied header.
          */
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                           inst->base_mrf, 2),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         implied_header = brw_null_reg();
      } else {
         /* Pre-gen6: g0 is supplied by the implied move; copy g1 into the
          * second header register ourselves.
          */
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p,
                 brw_message_reg(inst->base_mrf + 1),
                 brw_vec8_grf(1, 0));
      }
   } else {
      implied_header = brw_null_reg();
   }

   /* Choose the data port message control matching the dispatch mode. */
   if (this->dual_src_output.file != BAD_FILE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else if (c->dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
                c->dispatch_width,
                inst->base_mrf,
                implied_header,
                msg_control,
                inst->target,
                inst->mlen,
                0,
                eot,
                inst->header_present);
}

/* Computes the integer pixel x,y values from the origin.
 *
 * This is the basis of gl_FragCoord computation, but is also used
 * pre-gen6 for computing the deltas from v0 for computing
 * interpolation.
 */
void
fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
{
   /* g1 holds the packed per-subspan pixel coordinates as UW values. */
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
   struct brw_reg src;
   struct brw_reg deltas;

   if (is_x) {
      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
      deltas = brw_imm_v(0x10101010);
   } else {
      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
      deltas = brw_imm_v(0x11001100);
   }

   if (c->dispatch_width == 16) {
      dst = vec16(dst);
   }

   /* We do this 8 or 16-wide, but since the destination is UW we
    * don't do compression in the 16-wide case.
    */
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_ADD(p, dst, src, deltas);
   brw_pop_insn_state(p);
}

/**
 * Emit linear interpolation of a varying from its barycentric deltas.
 *
 * Uses a single PLN when the hardware has it and the delta registers are
 * laid out as PLN requires (delta_y immediately after delta_x, and on
 * pre-gen6 the pair must start on an even register); otherwise falls back
 * to the two-instruction LINE + MAC sequence.
 */
void
fs_visitor::generate_linterp(fs_inst *inst,
                             struct brw_reg dst, struct brw_reg *src)
{
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = src[1];
   struct brw_reg interp = src[2];

   if (brw->has_pln &&
       delta_y.nr == delta_x.nr + 1 &&
       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

/**
 * Gen7 single-source math: operands come directly from the GRF
 * (no message registers), so mlen must be 0.
 */
void
fs_visitor::generate_math1_gen7(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0)
{
   assert(inst->mlen == 0);
   brw_math(p, dst,
            brw_math_function(inst->opcode),
            inst->saturate ? BRW_MATH_SATURATE_SATURATE
                           : BRW_MATH_SATURATE_NONE,
            0, src0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}

/** Gen7 two-source math (POW, INT DIV): likewise message-less. */
void
fs_visitor::generate_math2_gen7(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(inst->mlen == 0);
   brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
}

/**
 * Gen6 single-source math.  The math instruction is emitted uncompressed;
 * for 16-wide dispatch a second instruction covering the second half of
 * the channels is emitted explicitly.
 */
void
fs_visitor::generate_math1_gen6(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen == 0);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math(p, dst,
            op,
            inst->saturate ? BRW_MATH_SATURATE_SATURATE :
            BRW_MATH_SATURATE_NONE,
            0, src0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);

   if (c->dispatch_width == 16) {
      /* Second half of the channels, then restore compressed mode. */
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math(p, sechalf(dst),
               op,
               inst->saturate ? BRW_MATH_SATURATE_SATURATE :
               BRW_MATH_SATURATE_NONE,
               0, sechalf(src0),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}

/** Gen6 two-source math; same uncompressed/second-half split as above. */
void
fs_visitor::generate_math2_gen6(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen == 0);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math2(p, dst, op, src0, src1);

   if (c->dispatch_width == 16) {
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}

/**
 * Gen4/5 math: implemented as a send to the shared math unit, so the
 * operand must have been loaded into message registers (mlen >= 1).
 */
void
fs_visitor::generate_math_gen4(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math(p, dst,
            op,
            inst->saturate ? BRW_MATH_SATURATE_SATURATE :
            BRW_MATH_SATURATE_NONE,
            inst->base_mrf, src,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);

   if (c->dispatch_width == 16) {
      /* Second half of the channels uses the next message register. */
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math(p, sechalf(dst),
               op,
               inst->saturate ? BRW_MATH_SATURATE_SATURATE :
               BRW_MATH_SATURATE_NONE,
               inst->base_mrf + 1, sechalf(src),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}

/**
 * Emit a sampler message for one of the texturing opcodes.
 *
 * Chooses the sampler return format from the destination type, the SIMD
 * mode and response length from the dispatch width, and the message type
 * from the opcode (with separate tables for gen5+ and gen4, whose
 * messages are selected partly by message length).  Also sets up the
 * message header when a texture offset or header is required.
 */
void
fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   if (c->dispatch_width == 16)
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;

   if (intel->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      default:
         assert(!"not reached");
         break;
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         assert(c->dispatch_width == 8);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
         } else {
            assert(inst->mlen <= 4);
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen == 9);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         assert(!"not reached");
         break;
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   /* Load the message header if present. If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->texture_offset) {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      /* Explicitly set up the message header by copying g0 to the MRF. */
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2. */
      brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                     inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(inst->texture_offset));
      brw_pop_insn_state(p);
   } else if (inst->header_present) {
      /* Set up an implied move from g0 to the MRF. */
      src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              SURF_INDEX_TEXTURE(inst->sampler),
              inst->sampler,
              WRITEMASK_XYZW,
              msg_type,
              rlen,
              inst->mlen,
              inst->header_present,
              simd_mode,
              return_format);
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * and we're trying to produce:
 *
 *           DDX                     DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
 * between each other. We could probably do it like ddx and swizzle the right
 * order later, but bail for now and just produce
 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 */
void
fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   /* src0 reads the right pixel of each pair, src1 the left; the ADD of
    * src0 - src1 yields the horizontal derivative per the table above.
    */
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_2,
                                 BRW_WIDTH_2,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_2,
                                 BRW_WIDTH_2,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                         bool negate_value)
{
   /* src0 reads the top row of each subspan, src1 the bottom row. */
   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_4,
                                 BRW_WIDTH_4,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_4,
                                 BRW_WIDTH_4,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   if (negate_value)
      brw_ADD(p, dst, src1, negate(src0));
   else
      brw_ADD(p, dst, src0, negate(src1));
}

/**
 * Emit the instruction sequence for a fragment discard: clear the bits in
 * the pixel mask (f0 on gen6+, the dispatched pixel mask word in g1.7;
 * g0/IMASK on older hardware) for the channels being killed.
 */
void
fs_visitor::generate_discard(fs_inst *inst)
{
   struct brw_reg f0 = brw_flag_reg();

   if (intel->gen >= 6) {
      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
      struct brw_reg some_register;

      /* As of gen6, we no longer have the mask register to look at,
       * so life gets a bit more complicated.
       */

      /* Load the flag register with all ones. */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, f0, brw_imm_uw(0xffff));
      brw_pop_insn_state(p);

      /* Do a comparison that should always fail, to produce 0s in the flag
       * reg where we have active channels.
       */
      some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
              BRW_CONDITIONAL_NZ, some_register, some_register);

      /* Undo CMP's whacking of predication*/
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_AND(p, g1, f0, g1);
      brw_pop_insn_state(p);
   } else {
      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      /* Unlike the 965, we have the mask reg, so we just need
       * somewhere to invert that (containing channels to be disabled)
       * so it can be ANDed with the mask of pixels still to be
       * written. Use the flag reg for consistency with gen6+.
       */
      brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
      brw_AND(p, g0, f0, g0);

      brw_pop_insn_state(p);
   }
}

/**
 * Spill a register to scratch space: copy it into a message register and
 * emit an oword block write at the instruction's scratch offset.
 */
void
fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
{
   assert(inst->mlen != 0);

   brw_MOV(p,
           retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
           retype(src, BRW_REGISTER_TYPE_UD));
   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
                                 inst->offset);
}

/**
 * Reload a spilled register from scratch space via an oword block read,
 * working around the gen4 send-destination errata with dummy MOVs.
 */
void
fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);

   /* Clear any post destination dependencies that would be ignored by
    * the block read. See the B-Spec for pre-gen5 send instruction.
    *
    * This could use a better solution, since texture sampling and
    * math reads could potentially run into it as well -- anywhere
    * that we have a SEND with a destination that is a register that
    * was written but not read within the last N instructions (what's
    * N? unsure). This is rare because of dead code elimination, but
    * not impossible.
    */
   if (intel->gen == 4 && !intel->is_g4x)
      brw_MOV(p, brw_null_reg(), dst);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
                                inst->offset);

   if (intel->gen == 4 && !intel->is_g4x) {
      /* gen4 errata: destination from a send can't be used as a
       * destination until it's been read. Just read it so we don't
       * have to worry.
       */
      brw_MOV(p, brw_null_reg(), dst);
   }
}

/**
 * Load a uniform from the pull-constant buffer: an oword block read from
 * the surface/offset given as immediate operands.  Applies the same gen4
 * send-destination errata workaround as generate_unspill().
 */
void
fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                        struct brw_reg index,
                                        struct brw_reg offset)
{
   assert(inst->mlen != 0);

   /* Clear any post destination dependencies that would be ignored by
    * the block read. See the B-Spec for pre-gen5 send instruction.
    *
    * This could use a better solution, since texture sampling and
    * math reads could potentially run into it as well -- anywhere
    * that we have a SEND with a destination that is a register that
    * was written but not read within the last N instructions (what's
    * N? unsure). This is rare because of dead code elimination, but
    * not impossible.
    */
   if (intel->gen == 4 && !intel->is_g4x)
      brw_MOV(p, brw_null_reg(), dst);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.dw1.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);

   if (intel->gen == 4 && !intel->is_g4x) {
      /* gen4 errata: destination from a send can't be used as a
       * destination until it's been read. Just read it so we don't
       * have to worry.
       */
      brw_MOV(p, brw_null_reg(), dst);
   }
}


/**
 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 * into the flags register (f0.0).
 *
 * Used only on Gen6 and above.
 */
void
fs_visitor::generate_mov_dispatch_to_flags()
{
   struct brw_reg f0 = brw_flag_reg();
   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);

   assert (intel->gen >= 6);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, f0, g1);
   brw_pop_insn_state(p);
}


/** Translate an fs_reg register file enum to the brw_reg hardware file. */
static uint32_t brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case GRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   default:
      assert(!"not reached");
      return BRW_GENERAL_REGISTER_FILE;
   }
}

/**
 * Convert an fs_reg (LIR register) to the hardware brw_reg encoding,
 * carrying over type, second-half selection, smear, abs, and negate.
 */
static struct brw_reg
brw_reg_from_fs_reg(fs_reg *reg)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case GRF:
   case ARF:
   case MRF:
      if (reg->smear == -1) {
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
      } else {
         /* Smear replicates a single component across the register. */
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
      }
      brw_reg = retype(brw_reg, reg->type);
      if (reg->sechalf)
         brw_reg = sechalf(brw_reg);
      break;
   case IMM:
      switch (reg->type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(reg->imm.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(reg->imm.i);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(reg->imm.u);
         break;
      default:
         assert(!"not reached");
         brw_reg = brw_null_reg();
         break;
      }
      break;
   case FIXED_HW_REG:
      brw_reg = reg->fixed_hw_reg;
      break;
   case BAD_FILE:
      /* Probably unused.
       */
      brw_reg = brw_null_reg();
      break;
   case UNIFORM:
      /* Uniforms should have been lowered to pull/push constants by now. */
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   default:
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   }
   if (reg->abs)
      brw_reg = brw_abs(brw_reg);
   if (reg->negate)
      brw_reg = negate(brw_reg);

   return brw_reg;
}

/**
 * Walk the FS LIR instruction list and emit the native EU instructions
 * for each one, setting the per-instruction execution state (conditional
 * modifier, predication, saturate, compression) before dispatching on the
 * opcode.  When INTEL_DEBUG=wm is set, also prints the annotated
 * disassembly interleaved with basic-block boundaries from the CFG.
 */
void
fs_visitor::generate_code()
{
   int last_native_inst = p->nr_insn;
   const char *last_annotation_string = NULL;
   ir_instruction *last_annotation_ir = NULL;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
             prog->Name, c->dispatch_width);
   }

   /* The CFG is only built for debug output of block boundaries. */
   fs_cfg *cfg = NULL;
   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      cfg = new(mem_ctx) fs_cfg(this);

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      struct brw_reg src[3], dst;

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         foreach_list(node, &cfg->block_list) {
            fs_bblock_link *link = (fs_bblock_link *)node;
            fs_bblock *block = link->block;

            if (block->start == inst) {
               printf("   START B%d", block->block_num);
               foreach_list(predecessor_node, &block->parents) {
                  fs_bblock_link *predecessor_link =
                     (fs_bblock_link *)predecessor_node;
                  fs_bblock *predecessor_block = predecessor_link->block;
                  printf(" <-B%d", predecessor_block->block_num);
               }
               printf("\n");
            }
         }

         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf("   ");
               last_annotation_ir->print();
               printf("\n");
            }
         }
         if (last_annotation_string != inst->annotation) {
            last_annotation_string = inst->annotation;
            if (last_annotation_string)
               printf("   %s\n", last_annotation_string);
         }
      }

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicated);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);

      if (inst->force_uncompressed || c->dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      } else if (inst->force_sechalf) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      } else {
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_set_acc_write_control(p, 1);
         brw_MACH(p, dst, src[0], src[1]);
         brw_set_acc_write_control(p, 0);
         break;

      case BRW_OPCODE_MAD:
         /* MAD requires align16 mode; 16-wide is emitted as two halves. */
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (c->dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_MAD(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_MAD(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(intel->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, BRW_EXECUTE_8);
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         /* FINISHME: We need to write the loop instruction support still. */
         if (intel->gen >= 6)
            gen6_CONT(p);
         else
            brw_CONT(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         if (intel->gen >= 7) {
            generate_math1_gen7(inst, dst, src[0]);
         } else if (intel->gen == 6) {
            generate_math1_gen6(inst, dst, src[0]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         if (intel->gen >= 7) {
            generate_math2_gen7(inst, dst, src[0], src[1]);
         } else if (intel->gen == 6) {
            generate_math2_gen6(inst, dst, src[0], src[1]);
         } else {
            /* NOTE(review): pre-gen6 two-source math passes only src[0];
             * the second operand is expected in message registers here.
             */
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case FS_OPCODE_PIXEL_X:
         generate_pixel_xy(dst, true);
         break;
      case FS_OPCODE_PIXEL_Y:
         generate_pixel_xy(dst, false);
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
         generate_tex(inst, dst, src[0]);
         break;
      case FS_OPCODE_DISCARD:
         generate_discard(inst);
         break;
      case FS_OPCODE_DDX:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY:
         /* Make sure fp->UsesDFdy flag got set (otherwise there's no
          * guarantee that c->key.render_to_fbo is set).
          */
         assert(fp->UsesDFdy);
         generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
         break;

      case FS_OPCODE_SPILL:
         generate_spill(inst, src[0]);
         break;

      case FS_OPCODE_UNSPILL:
         generate_unspill(inst, dst);
         break;

      case FS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags();
         break;

      default:
         if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          brw_opcodes[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
         fail("unsupported opcode in FS\n");
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         /* Disassemble the native instructions emitted for this LIR inst. */
         for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
            if (0) {
               printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                      ((uint32_t *)&p->store[i])[3],
                      ((uint32_t *)&p->store[i])[2],
                      ((uint32_t *)&p->store[i])[1],
                      ((uint32_t *)&p->store[i])[0]);
            }
            brw_disasm(stdout, &p->store[i], intel->gen);
         }

         foreach_list(node, &cfg->block_list) {
            fs_bblock_link *link = (fs_bblock_link *)node;
            fs_bblock *block = link->block;

            if (block->end == inst) {
               printf("   END B%d", block->block_num);
               foreach_list(successor_node, &block->children) {
                  fs_bblock_link *successor_link =
                     (fs_bblock_link *)successor_node;
                  fs_bblock *successor_block = successor_link->block;
                  printf(" ->B%d", successor_block->block_num);
               }
               printf("\n");
            }
         }
      }

      last_native_inst = p->nr_insn;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug.  So this is here in
    * case you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         for (unsigned int i = 0; i < p->nr_insn; i++) {
            printf("0x%08x 0x%08x 0x%08x 0x%08x ",
                   ((uint32_t *)&p->store[i])[3],
                   ((uint32_t *)&p->store[i])[2],
                   ((uint32_t *)&p->store[i])[1],
                   ((uint32_t *)&p->store[i])[0]);
            brw_disasm(stdout, &p->store[i], intel->gen);
         }
      }
   }
}