/* brw_eu_emit.c — revision 8873120f9fb0c82cfd46cd15c39e66c38076cb0d */
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 26 **********************************************************************/ 27 /* 28 * Authors: 29 * Keith Whitwell <keithw@vmware.com> 30 */ 31 32 33#include "brw_context.h" 34#include "brw_defines.h" 35#include "brw_eu.h" 36 37#include "glsl/ralloc.h" 38 39/*********************************************************************** 40 * Internal helper for constructing instructions 41 */ 42 43static void guess_execution_size(struct brw_compile *p, 44 struct brw_instruction *insn, 45 struct brw_reg reg) 46{ 47 if (reg.width == BRW_WIDTH_8 && p->compressed) 48 insn->header.execution_size = BRW_EXECUTE_16; 49 else 50 insn->header.execution_size = reg.width; /* note - definitions are compatible */ 51} 52 53 54/** 55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source 56 * registers, implicitly moving the operand to a message register. 57 * 58 * On Sandybridge, this is no longer the case. This function performs the 59 * explicit move; it should be called before emitting a SEND instruction. 60 */ 61void 62gen6_resolve_implied_move(struct brw_compile *p, 63 struct brw_reg *src, 64 unsigned msg_reg_nr) 65{ 66 struct brw_context *brw = p->brw; 67 if (brw->gen < 6) 68 return; 69 70 if (src->file == BRW_MESSAGE_REGISTER_FILE) 71 return; 72 73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { 74 brw_push_insn_state(p); 75 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 76 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD), 78 retype(*src, BRW_REGISTER_TYPE_UD)); 79 brw_pop_insn_state(p); 80 } 81 *src = brw_message_reg(msg_reg_nr); 82} 83 84static void 85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg) 86{ 87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"): 88 * "The send with EOT should use register space R112-R127 for <src>. 
This is 89 * to enable loading of a new thread into the same slot while the message 90 * with EOT for current thread is pending dispatch." 91 * 92 * Since we're pretending to have 16 MRFs anyway, we may as well use the 93 * registers required for messages with EOT. 94 */ 95 struct brw_context *brw = p->brw; 96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) { 97 reg->file = BRW_GENERAL_REGISTER_FILE; 98 reg->nr += GEN7_MRF_HACK_START; 99 } 100} 101 102/** 103 * Convert a brw_reg_type enumeration value into the hardware representation. 104 * 105 * The hardware encoding may depend on whether the value is an immediate. 106 */ 107unsigned 108brw_reg_type_to_hw_type(const struct brw_context *brw, 109 enum brw_reg_type type, unsigned file) 110{ 111 if (file == BRW_IMMEDIATE_VALUE) { 112 const static int imm_hw_types[] = { 113 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD, 114 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D, 115 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW, 116 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W, 117 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F, 118 [BRW_REGISTER_TYPE_UB] = -1, 119 [BRW_REGISTER_TYPE_B] = -1, 120 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV, 121 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF, 122 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V, 123 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF, 124 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF, 125 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ, 126 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q, 127 }; 128 assert(type < ARRAY_SIZE(imm_hw_types)); 129 assert(imm_hw_types[type] != -1); 130 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF); 131 return imm_hw_types[type]; 132 } else { 133 /* Non-immediate registers */ 134 const static int hw_types[] = { 135 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD, 136 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D, 137 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW, 138 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W, 139 [BRW_REGISTER_TYPE_UB] = 
BRW_HW_REG_NON_IMM_TYPE_UB, 140 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B, 141 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F, 142 [BRW_REGISTER_TYPE_UV] = -1, 143 [BRW_REGISTER_TYPE_VF] = -1, 144 [BRW_REGISTER_TYPE_V] = -1, 145 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF, 146 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF, 147 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ, 148 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q, 149 }; 150 assert(type < ARRAY_SIZE(hw_types)); 151 assert(hw_types[type] != -1); 152 assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF); 153 assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF); 154 return hw_types[type]; 155 } 156} 157 158void 159brw_set_dest(struct brw_compile *p, struct brw_instruction *insn, 160 struct brw_reg dest) 161{ 162 if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE && 163 dest.file != BRW_MESSAGE_REGISTER_FILE) 164 assert(dest.nr < 128); 165 166 gen7_convert_mrf_to_grf(p, &dest); 167 168 insn->bits1.da1.dest_reg_file = dest.file; 169 insn->bits1.da1.dest_reg_type = 170 brw_reg_type_to_hw_type(p->brw, dest.type, dest.file); 171 insn->bits1.da1.dest_address_mode = dest.address_mode; 172 173 if (dest.address_mode == BRW_ADDRESS_DIRECT) { 174 insn->bits1.da1.dest_reg_nr = dest.nr; 175 176 if (insn->header.access_mode == BRW_ALIGN_1) { 177 insn->bits1.da1.dest_subreg_nr = dest.subnr; 178 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) 179 dest.hstride = BRW_HORIZONTAL_STRIDE_1; 180 insn->bits1.da1.dest_horiz_stride = dest.hstride; 181 } else { 182 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16; 183 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask; 184 if (dest.file == BRW_GENERAL_REGISTER_FILE || 185 dest.file == BRW_MESSAGE_REGISTER_FILE) { 186 assert(dest.dw1.bits.writemask != 0); 187 } 188 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1: 189 * Although Dst.HorzStride is a don't care for Align16, HW needs 190 * this to be programmed as "01". 
191 */ 192 insn->bits1.da16.dest_horiz_stride = 1; 193 } 194 } else { 195 insn->bits1.ia1.dest_subreg_nr = dest.subnr; 196 197 /* These are different sizes in align1 vs align16: 198 */ 199 if (insn->header.access_mode == BRW_ALIGN_1) { 200 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset; 201 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) 202 dest.hstride = BRW_HORIZONTAL_STRIDE_1; 203 insn->bits1.ia1.dest_horiz_stride = dest.hstride; 204 } else { 205 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset; 206 /* even ignored in da16, still need to set as '01' */ 207 insn->bits1.ia16.dest_horiz_stride = 1; 208 } 209 } 210 211 /* NEW: Set the execution size based on dest.width and 212 * insn->compression_control: 213 */ 214 guess_execution_size(p, insn, dest); 215} 216 217extern int reg_type_size[]; 218 219static void 220validate_reg(struct brw_instruction *insn, struct brw_reg reg) 221{ 222 int hstride_for_reg[] = {0, 1, 2, 4}; 223 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256}; 224 int width_for_reg[] = {1, 2, 4, 8, 16}; 225 int execsize_for_reg[] = {1, 2, 4, 8, 16}; 226 int width, hstride, vstride, execsize; 227 228 if (reg.file == BRW_IMMEDIATE_VALUE) { 229 /* 3.3.6: Region Parameters. Restriction: Immediate vectors 230 * mean the destination has to be 128-bit aligned and the 231 * destination horiz stride has to be a word. 
232 */ 233 if (reg.type == BRW_REGISTER_TYPE_V) { 234 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] * 235 reg_type_size[insn->bits1.da1.dest_reg_type] == 2); 236 } 237 238 return; 239 } 240 241 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && 242 reg.file == BRW_ARF_NULL) 243 return; 244 245 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg)); 246 hstride = hstride_for_reg[reg.hstride]; 247 248 if (reg.vstride == 0xf) { 249 vstride = -1; 250 } else { 251 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg)); 252 vstride = vstride_for_reg[reg.vstride]; 253 } 254 255 assert(reg.width >= 0 && reg.width < Elements(width_for_reg)); 256 width = width_for_reg[reg.width]; 257 258 assert(insn->header.execution_size >= 0 && 259 insn->header.execution_size < Elements(execsize_for_reg)); 260 execsize = execsize_for_reg[insn->header.execution_size]; 261 262 /* Restrictions from 3.3.10: Register Region Restrictions. */ 263 /* 3. */ 264 assert(execsize >= width); 265 266 /* 4. */ 267 if (execsize == width && hstride != 0) { 268 assert(vstride == -1 || vstride == width * hstride); 269 } 270 271 /* 5. */ 272 if (execsize == width && hstride == 0) { 273 /* no restriction on vstride. */ 274 } 275 276 /* 6. */ 277 if (width == 1) { 278 assert(hstride == 0); 279 } 280 281 /* 7. */ 282 if (execsize == 1 && width == 1) { 283 assert(hstride == 0); 284 assert(vstride == 0); 285 } 286 287 /* 8. */ 288 if (vstride == 0 && hstride == 0) { 289 assert(width == 1); 290 } 291 292 /* 10. Check destination issues. */ 293} 294 295static bool 296is_compactable_immediate(unsigned imm) 297{ 298 /* We get the low 12 bits as-is. */ 299 imm &= ~0xfff; 300 301 /* We get one bit replicated through the top 20 bits. 
*/ 302 return imm == 0 || imm == 0xfffff000; 303} 304 305void 306brw_set_src0(struct brw_compile *p, struct brw_instruction *insn, 307 struct brw_reg reg) 308{ 309 struct brw_context *brw = p->brw; 310 311 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE) 312 assert(reg.nr < 128); 313 314 gen7_convert_mrf_to_grf(p, ®); 315 316 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND || 317 insn->header.opcode == BRW_OPCODE_SENDC)) { 318 /* Any source modifiers or regions will be ignored, since this just 319 * identifies the MRF/GRF to start reading the message contents from. 320 * Check for some likely failures. 321 */ 322 assert(!reg.negate); 323 assert(!reg.abs); 324 assert(reg.address_mode == BRW_ADDRESS_DIRECT); 325 } 326 327 validate_reg(insn, reg); 328 329 insn->bits1.da1.src0_reg_file = reg.file; 330 insn->bits1.da1.src0_reg_type = 331 brw_reg_type_to_hw_type(brw, reg.type, reg.file); 332 insn->bits2.da1.src0_abs = reg.abs; 333 insn->bits2.da1.src0_negate = reg.negate; 334 insn->bits2.da1.src0_address_mode = reg.address_mode; 335 336 if (reg.file == BRW_IMMEDIATE_VALUE) { 337 insn->bits3.ud = reg.dw1.ud; 338 339 /* The Bspec's section titled "Non-present Operands" claims that if src0 340 * is an immediate that src1's type must be the same as that of src0. 341 * 342 * The SNB+ DataTypeIndex instruction compaction tables contain mappings 343 * that do not follow this rule. E.g., from the IVB/HSW table: 344 * 345 * DataTypeIndex 18-Bit Mapping Mapped Meaning 346 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir | 347 * 348 * And from the SNB table: 349 * 350 * DataTypeIndex 18-Bit Mapping Mapped Meaning 351 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir | 352 * 353 * Neither of these cause warnings from the simulator when used, 354 * compacted or otherwise. In fact, all compaction mappings that have an 355 * immediate in src0 use a:ud for src1. 
356 * 357 * The GM45 instruction compaction tables do not contain mapped meanings 358 * so it's not clear whether it has the restriction. We'll assume it was 359 * lifted on SNB. (FINISHME: decode the GM45 tables and check.) 360 */ 361 insn->bits1.da1.src1_reg_file = 0; /* arf */ 362 if (brw->gen < 6) { 363 insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type; 364 } else { 365 insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD; 366 } 367 368 /* Compacted instructions only have 12-bits (plus 1 for the other 20) 369 * for immediate values. Presumably the hardware engineers realized 370 * that the only useful floating-point value that could be represented 371 * in this format is 0.0, which can also be represented as a VF-typed 372 * immediate, so they gave us the previously mentioned mapping on IVB+. 373 * 374 * Strangely, we do have a mapping for imm:f in src1, so we don't need 375 * to do this there. 376 * 377 * If we see a 0.0:F, change the type to VF so that it can be compacted. 378 */ 379 if (insn->bits3.ud == 0x0 && 380 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) { 381 insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF; 382 } 383 384 /* There are no mappings for dst:d | i:d, so if the immediate is suitable 385 * set the types to :UD so the instruction can be compacted. 
386 */ 387 if (is_compactable_immediate(insn->bits3.ud) && 388 insn->header.destreg__conditionalmod == BRW_CONDITIONAL_NONE && 389 insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_D && 390 insn->bits1.da1.dest_reg_type == BRW_HW_REG_TYPE_D) { 391 insn->bits1.da1.src0_reg_type = BRW_HW_REG_TYPE_UD; 392 insn->bits1.da1.dest_reg_type = BRW_HW_REG_TYPE_UD; 393 } 394 } else { 395 if (reg.address_mode == BRW_ADDRESS_DIRECT) { 396 if (insn->header.access_mode == BRW_ALIGN_1) { 397 insn->bits2.da1.src0_subreg_nr = reg.subnr; 398 insn->bits2.da1.src0_reg_nr = reg.nr; 399 } else { 400 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16; 401 insn->bits2.da16.src0_reg_nr = reg.nr; 402 } 403 } else { 404 insn->bits2.ia1.src0_subreg_nr = reg.subnr; 405 406 if (insn->header.access_mode == BRW_ALIGN_1) { 407 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset; 408 } else { 409 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset; 410 } 411 } 412 413 if (insn->header.access_mode == BRW_ALIGN_1) { 414 if (reg.width == BRW_WIDTH_1 && 415 insn->header.execution_size == BRW_EXECUTE_1) { 416 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0; 417 insn->bits2.da1.src0_width = BRW_WIDTH_1; 418 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0; 419 } else { 420 insn->bits2.da1.src0_horiz_stride = reg.hstride; 421 insn->bits2.da1.src0_width = reg.width; 422 insn->bits2.da1.src0_vert_stride = reg.vstride; 423 } 424 } else { 425 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); 426 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); 427 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); 428 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); 429 430 /* This is an oddity of the fact we're using the same 431 * descriptions for registers in align_16 as align_1: 432 */ 433 if (reg.vstride == BRW_VERTICAL_STRIDE_8) 434 
insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4; 435 else 436 insn->bits2.da16.src0_vert_stride = reg.vstride; 437 } 438 } 439} 440 441 442void 443brw_set_src1(struct brw_compile *p, 444 struct brw_instruction *insn, 445 struct brw_reg reg) 446{ 447 assert(reg.file != BRW_MESSAGE_REGISTER_FILE); 448 449 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE) 450 assert(reg.nr < 128); 451 452 gen7_convert_mrf_to_grf(p, ®); 453 454 validate_reg(insn, reg); 455 456 insn->bits1.da1.src1_reg_file = reg.file; 457 insn->bits1.da1.src1_reg_type = 458 brw_reg_type_to_hw_type(p->brw, reg.type, reg.file); 459 insn->bits3.da1.src1_abs = reg.abs; 460 insn->bits3.da1.src1_negate = reg.negate; 461 462 /* Only src1 can be immediate in two-argument instructions. 463 */ 464 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE); 465 466 if (reg.file == BRW_IMMEDIATE_VALUE) { 467 insn->bits3.ud = reg.dw1.ud; 468 } else { 469 /* This is a hardware restriction, which may or may not be lifted 470 * in the future: 471 */ 472 assert (reg.address_mode == BRW_ADDRESS_DIRECT); 473 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */ 474 475 if (insn->header.access_mode == BRW_ALIGN_1) { 476 insn->bits3.da1.src1_subreg_nr = reg.subnr; 477 insn->bits3.da1.src1_reg_nr = reg.nr; 478 } else { 479 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16; 480 insn->bits3.da16.src1_reg_nr = reg.nr; 481 } 482 483 if (insn->header.access_mode == BRW_ALIGN_1) { 484 if (reg.width == BRW_WIDTH_1 && 485 insn->header.execution_size == BRW_EXECUTE_1) { 486 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0; 487 insn->bits3.da1.src1_width = BRW_WIDTH_1; 488 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0; 489 } else { 490 insn->bits3.da1.src1_horiz_stride = reg.hstride; 491 insn->bits3.da1.src1_width = reg.width; 492 insn->bits3.da1.src1_vert_stride = reg.vstride; 493 } 494 } else { 495 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); 496 
insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); 497 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); 498 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); 499 500 /* This is an oddity of the fact we're using the same 501 * descriptions for registers in align_16 as align_1: 502 */ 503 if (reg.vstride == BRW_VERTICAL_STRIDE_8) 504 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4; 505 else 506 insn->bits3.da16.src1_vert_stride = reg.vstride; 507 } 508 } 509} 510 511/** 512 * Set the Message Descriptor and Extended Message Descriptor fields 513 * for SEND messages. 514 * 515 * \note This zeroes out the Function Control bits, so it must be called 516 * \b before filling out any message-specific data. Callers can 517 * choose not to fill in irrelevant bits; they will be zero. 518 */ 519static void 520brw_set_message_descriptor(struct brw_compile *p, 521 struct brw_instruction *inst, 522 enum brw_message_target sfid, 523 unsigned msg_length, 524 unsigned response_length, 525 bool header_present, 526 bool end_of_thread) 527{ 528 struct brw_context *brw = p->brw; 529 530 brw_set_src1(p, inst, brw_imm_d(0)); 531 532 if (brw->gen >= 5) { 533 inst->bits3.generic_gen5.header_present = header_present; 534 inst->bits3.generic_gen5.response_length = response_length; 535 inst->bits3.generic_gen5.msg_length = msg_length; 536 inst->bits3.generic_gen5.end_of_thread = end_of_thread; 537 538 if (brw->gen >= 6) { 539 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */ 540 inst->header.destreg__conditionalmod = sfid; 541 } else { 542 /* Set Extended Message Descriptor (ex_desc) */ 543 inst->bits2.send_gen5.sfid = sfid; 544 inst->bits2.send_gen5.end_of_thread = end_of_thread; 545 } 546 } else { 547 inst->bits3.generic.response_length = response_length; 548 inst->bits3.generic.msg_length = msg_length; 549 inst->bits3.generic.msg_target = sfid; 550 
inst->bits3.generic.end_of_thread = end_of_thread; 551 } 552} 553 554static void brw_set_math_message( struct brw_compile *p, 555 struct brw_instruction *insn, 556 unsigned function, 557 unsigned integer_type, 558 bool low_precision, 559 unsigned dataType ) 560{ 561 struct brw_context *brw = p->brw; 562 unsigned msg_length; 563 unsigned response_length; 564 565 /* Infer message length from the function */ 566 switch (function) { 567 case BRW_MATH_FUNCTION_POW: 568 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: 569 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: 570 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: 571 msg_length = 2; 572 break; 573 default: 574 msg_length = 1; 575 break; 576 } 577 578 /* Infer response length from the function */ 579 switch (function) { 580 case BRW_MATH_FUNCTION_SINCOS: 581 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: 582 response_length = 2; 583 break; 584 default: 585 response_length = 1; 586 break; 587 } 588 589 590 brw_set_message_descriptor(p, insn, BRW_SFID_MATH, 591 msg_length, response_length, false, false); 592 if (brw->gen == 5) { 593 insn->bits3.math_gen5.function = function; 594 insn->bits3.math_gen5.int_type = integer_type; 595 insn->bits3.math_gen5.precision = low_precision; 596 insn->bits3.math_gen5.saturate = insn->header.saturate; 597 insn->bits3.math_gen5.data_type = dataType; 598 insn->bits3.math_gen5.snapshot = 0; 599 } else { 600 insn->bits3.math.function = function; 601 insn->bits3.math.int_type = integer_type; 602 insn->bits3.math.precision = low_precision; 603 insn->bits3.math.saturate = insn->header.saturate; 604 insn->bits3.math.data_type = dataType; 605 } 606 insn->header.saturate = 0; 607} 608 609 610static void brw_set_ff_sync_message(struct brw_compile *p, 611 struct brw_instruction *insn, 612 bool allocate, 613 unsigned response_length, 614 bool end_of_thread) 615{ 616 brw_set_message_descriptor(p, insn, BRW_SFID_URB, 617 1, response_length, true, end_of_thread); 618 insn->bits3.urb_gen5.opcode = 
1; /* FF_SYNC */ 619 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */ 620 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */ 621 insn->bits3.urb_gen5.allocate = allocate; 622 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */ 623 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */ 624} 625 626static void brw_set_urb_message( struct brw_compile *p, 627 struct brw_instruction *insn, 628 enum brw_urb_write_flags flags, 629 unsigned msg_length, 630 unsigned response_length, 631 unsigned offset, 632 unsigned swizzle_control ) 633{ 634 struct brw_context *brw = p->brw; 635 636 brw_set_message_descriptor(p, insn, BRW_SFID_URB, 637 msg_length, response_length, true, 638 flags & BRW_URB_WRITE_EOT); 639 if (brw->gen == 7) { 640 if (flags & BRW_URB_WRITE_OWORD) { 641 assert(msg_length == 2); /* header + one OWORD of data */ 642 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD; 643 } else { 644 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD; 645 } 646 insn->bits3.urb_gen7.offset = offset; 647 assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE); 648 insn->bits3.urb_gen7.swizzle_control = swizzle_control; 649 insn->bits3.urb_gen7.per_slot_offset = 650 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0; 651 insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0; 652 } else if (brw->gen >= 5) { 653 insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */ 654 insn->bits3.urb_gen5.offset = offset; 655 insn->bits3.urb_gen5.swizzle_control = swizzle_control; 656 insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0; 657 insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1; 658 insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0; 659 } else { 660 insn->bits3.urb.opcode = 0; /* ? */ 661 insn->bits3.urb.offset = offset; 662 insn->bits3.urb.swizzle_control = swizzle_control; 663 insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 
1 : 0; 664 insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1; 665 insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0; 666 } 667} 668 669void 670brw_set_dp_write_message(struct brw_compile *p, 671 struct brw_instruction *insn, 672 unsigned binding_table_index, 673 unsigned msg_control, 674 unsigned msg_type, 675 unsigned msg_length, 676 bool header_present, 677 unsigned last_render_target, 678 unsigned response_length, 679 unsigned end_of_thread, 680 unsigned send_commit_msg) 681{ 682 struct brw_context *brw = p->brw; 683 unsigned sfid; 684 685 if (brw->gen >= 7) { 686 /* Use the Render Cache for RT writes; otherwise use the Data Cache */ 687 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE) 688 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; 689 else 690 sfid = GEN7_SFID_DATAPORT_DATA_CACHE; 691 } else if (brw->gen == 6) { 692 /* Use the render cache for all write messages. */ 693 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; 694 } else { 695 sfid = BRW_SFID_DATAPORT_WRITE; 696 } 697 698 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length, 699 header_present, end_of_thread); 700 701 if (brw->gen >= 7) { 702 insn->bits3.gen7_dp.binding_table_index = binding_table_index; 703 insn->bits3.gen7_dp.msg_control = msg_control; 704 insn->bits3.gen7_dp.last_render_target = last_render_target; 705 insn->bits3.gen7_dp.msg_type = msg_type; 706 } else if (brw->gen == 6) { 707 insn->bits3.gen6_dp.binding_table_index = binding_table_index; 708 insn->bits3.gen6_dp.msg_control = msg_control; 709 insn->bits3.gen6_dp.last_render_target = last_render_target; 710 insn->bits3.gen6_dp.msg_type = msg_type; 711 insn->bits3.gen6_dp.send_commit_msg = send_commit_msg; 712 } else if (brw->gen == 5) { 713 insn->bits3.dp_write_gen5.binding_table_index = binding_table_index; 714 insn->bits3.dp_write_gen5.msg_control = msg_control; 715 insn->bits3.dp_write_gen5.last_render_target = last_render_target; 716 insn->bits3.dp_write_gen5.msg_type = msg_type; 717 
insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg; 718 } else { 719 insn->bits3.dp_write.binding_table_index = binding_table_index; 720 insn->bits3.dp_write.msg_control = msg_control; 721 insn->bits3.dp_write.last_render_target = last_render_target; 722 insn->bits3.dp_write.msg_type = msg_type; 723 insn->bits3.dp_write.send_commit_msg = send_commit_msg; 724 } 725} 726 727void 728brw_set_dp_read_message(struct brw_compile *p, 729 struct brw_instruction *insn, 730 unsigned binding_table_index, 731 unsigned msg_control, 732 unsigned msg_type, 733 unsigned target_cache, 734 unsigned msg_length, 735 bool header_present, 736 unsigned response_length) 737{ 738 struct brw_context *brw = p->brw; 739 unsigned sfid; 740 741 if (brw->gen >= 7) { 742 sfid = GEN7_SFID_DATAPORT_DATA_CACHE; 743 } else if (brw->gen == 6) { 744 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE) 745 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; 746 else 747 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE; 748 } else { 749 sfid = BRW_SFID_DATAPORT_READ; 750 } 751 752 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length, 753 header_present, false); 754 755 if (brw->gen >= 7) { 756 insn->bits3.gen7_dp.binding_table_index = binding_table_index; 757 insn->bits3.gen7_dp.msg_control = msg_control; 758 insn->bits3.gen7_dp.last_render_target = 0; 759 insn->bits3.gen7_dp.msg_type = msg_type; 760 } else if (brw->gen == 6) { 761 insn->bits3.gen6_dp.binding_table_index = binding_table_index; 762 insn->bits3.gen6_dp.msg_control = msg_control; 763 insn->bits3.gen6_dp.last_render_target = 0; 764 insn->bits3.gen6_dp.msg_type = msg_type; 765 insn->bits3.gen6_dp.send_commit_msg = 0; 766 } else if (brw->gen == 5) { 767 insn->bits3.dp_read_gen5.binding_table_index = binding_table_index; 768 insn->bits3.dp_read_gen5.msg_control = msg_control; 769 insn->bits3.dp_read_gen5.msg_type = msg_type; 770 insn->bits3.dp_read_gen5.target_cache = target_cache; 771 } else if (brw->is_g4x) { 772 
insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/ 773 insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/ 774 insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/ 775 insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/ 776 } else { 777 insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/ 778 insn->bits3.dp_read.msg_control = msg_control; /*8:11*/ 779 insn->bits3.dp_read.msg_type = msg_type; /*12:13*/ 780 insn->bits3.dp_read.target_cache = target_cache; /*14:15*/ 781 } 782} 783 784void 785brw_set_sampler_message(struct brw_compile *p, 786 struct brw_instruction *insn, 787 unsigned binding_table_index, 788 unsigned sampler, 789 unsigned msg_type, 790 unsigned response_length, 791 unsigned msg_length, 792 unsigned header_present, 793 unsigned simd_mode, 794 unsigned return_format) 795{ 796 struct brw_context *brw = p->brw; 797 798 brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length, 799 response_length, header_present, false); 800 801 if (brw->gen >= 7) { 802 insn->bits3.sampler_gen7.binding_table_index = binding_table_index; 803 insn->bits3.sampler_gen7.sampler = sampler; 804 insn->bits3.sampler_gen7.msg_type = msg_type; 805 insn->bits3.sampler_gen7.simd_mode = simd_mode; 806 } else if (brw->gen >= 5) { 807 insn->bits3.sampler_gen5.binding_table_index = binding_table_index; 808 insn->bits3.sampler_gen5.sampler = sampler; 809 insn->bits3.sampler_gen5.msg_type = msg_type; 810 insn->bits3.sampler_gen5.simd_mode = simd_mode; 811 } else if (brw->is_g4x) { 812 insn->bits3.sampler_g4x.binding_table_index = binding_table_index; 813 insn->bits3.sampler_g4x.sampler = sampler; 814 insn->bits3.sampler_g4x.msg_type = msg_type; 815 } else { 816 insn->bits3.sampler.binding_table_index = binding_table_index; 817 insn->bits3.sampler.sampler = sampler; 818 insn->bits3.sampler.msg_type = msg_type; 819 insn->bits3.sampler.return_format = return_format; 820 } 821} 822 823 824#define next_insn brw_next_insn 
825struct brw_instruction * 826brw_next_insn(struct brw_compile *p, unsigned opcode) 827{ 828 struct brw_instruction *insn; 829 830 if (p->nr_insn + 1 > p->store_size) { 831 p->store_size <<= 1; 832 p->store = reralloc(p->mem_ctx, p->store, 833 struct brw_instruction, p->store_size); 834 } 835 836 p->next_insn_offset += 16; 837 insn = &p->store[p->nr_insn++]; 838 memcpy(insn, p->current, sizeof(*insn)); 839 840 insn->header.opcode = opcode; 841 return insn; 842} 843 844static struct brw_instruction *brw_alu1( struct brw_compile *p, 845 unsigned opcode, 846 struct brw_reg dest, 847 struct brw_reg src ) 848{ 849 struct brw_instruction *insn = next_insn(p, opcode); 850 brw_set_dest(p, insn, dest); 851 brw_set_src0(p, insn, src); 852 return insn; 853} 854 855static struct brw_instruction *brw_alu2(struct brw_compile *p, 856 unsigned opcode, 857 struct brw_reg dest, 858 struct brw_reg src0, 859 struct brw_reg src1 ) 860{ 861 struct brw_instruction *insn = next_insn(p, opcode); 862 brw_set_dest(p, insn, dest); 863 brw_set_src0(p, insn, src0); 864 brw_set_src1(p, insn, src1); 865 return insn; 866} 867 868static int 869get_3src_subreg_nr(struct brw_reg reg) 870{ 871 if (reg.vstride == BRW_VERTICAL_STRIDE_0) { 872 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle)); 873 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0); 874 } else { 875 return reg.subnr / 4; 876 } 877} 878 879static struct brw_instruction *brw_alu3(struct brw_compile *p, 880 unsigned opcode, 881 struct brw_reg dest, 882 struct brw_reg src0, 883 struct brw_reg src1, 884 struct brw_reg src2) 885{ 886 struct brw_context *brw = p->brw; 887 struct brw_instruction *insn = next_insn(p, opcode); 888 889 gen7_convert_mrf_to_grf(p, &dest); 890 891 assert(insn->header.access_mode == BRW_ALIGN_16); 892 893 assert(dest.file == BRW_GENERAL_REGISTER_FILE || 894 dest.file == BRW_MESSAGE_REGISTER_FILE); 895 assert(dest.nr < 128); 896 assert(dest.address_mode == BRW_ADDRESS_DIRECT); 897 assert(dest.type == 
BRW_REGISTER_TYPE_F || 898 dest.type == BRW_REGISTER_TYPE_D || 899 dest.type == BRW_REGISTER_TYPE_UD); 900 insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE); 901 insn->bits1.da3src.dest_reg_nr = dest.nr; 902 insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16; 903 insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask; 904 guess_execution_size(p, insn, dest); 905 906 assert(src0.file == BRW_GENERAL_REGISTER_FILE); 907 assert(src0.address_mode == BRW_ADDRESS_DIRECT); 908 assert(src0.nr < 128); 909 insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle; 910 insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0); 911 insn->bits2.da3src.src0_reg_nr = src0.nr; 912 insn->bits1.da3src.src0_abs = src0.abs; 913 insn->bits1.da3src.src0_negate = src0.negate; 914 insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0; 915 916 assert(src1.file == BRW_GENERAL_REGISTER_FILE); 917 assert(src1.address_mode == BRW_ADDRESS_DIRECT); 918 assert(src1.nr < 128); 919 insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle; 920 insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3; 921 insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2; 922 insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0; 923 insn->bits3.da3src.src1_reg_nr = src1.nr; 924 insn->bits1.da3src.src1_abs = src1.abs; 925 insn->bits1.da3src.src1_negate = src1.negate; 926 927 assert(src2.file == BRW_GENERAL_REGISTER_FILE); 928 assert(src2.address_mode == BRW_ADDRESS_DIRECT); 929 assert(src2.nr < 128); 930 insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle; 931 insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2); 932 insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0; 933 insn->bits3.da3src.src2_reg_nr = src2.nr; 934 insn->bits1.da3src.src2_abs = src2.abs; 935 insn->bits1.da3src.src2_negate = src2.negate; 936 937 if (brw->gen >= 7) { 938 /* Set both the source and destination types 
based on dest.type, 939 * ignoring the source register types. The MAD and LRP emitters ensure 940 * that all four types are float. The BFE and BFI2 emitters, however, 941 * may send us mixed D and UD types and want us to ignore that and use 942 * the destination type. 943 */ 944 switch (dest.type) { 945 case BRW_REGISTER_TYPE_F: 946 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F; 947 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F; 948 break; 949 case BRW_REGISTER_TYPE_D: 950 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D; 951 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D; 952 break; 953 case BRW_REGISTER_TYPE_UD: 954 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD; 955 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD; 956 break; 957 } 958 } 959 960 return insn; 961} 962 963 964/*********************************************************************** 965 * Convenience routines. 966 */ 967#define ALU1(OP) \ 968struct brw_instruction *brw_##OP(struct brw_compile *p, \ 969 struct brw_reg dest, \ 970 struct brw_reg src0) \ 971{ \ 972 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \ 973} 974 975#define ALU2(OP) \ 976struct brw_instruction *brw_##OP(struct brw_compile *p, \ 977 struct brw_reg dest, \ 978 struct brw_reg src0, \ 979 struct brw_reg src1) \ 980{ \ 981 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \ 982} 983 984#define ALU3(OP) \ 985struct brw_instruction *brw_##OP(struct brw_compile *p, \ 986 struct brw_reg dest, \ 987 struct brw_reg src0, \ 988 struct brw_reg src1, \ 989 struct brw_reg src2) \ 990{ \ 991 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ 992} 993 994#define ALU3F(OP) \ 995struct brw_instruction *brw_##OP(struct brw_compile *p, \ 996 struct brw_reg dest, \ 997 struct brw_reg src0, \ 998 struct brw_reg src1, \ 999 struct brw_reg src2) \ 1000{ \ 1001 assert(dest.type == BRW_REGISTER_TYPE_F); \ 1002 assert(src0.type == BRW_REGISTER_TYPE_F); \ 1003 assert(src1.type == BRW_REGISTER_TYPE_F); \ 1004 assert(src2.type == 
BRW_REGISTER_TYPE_F); \ 1005 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \ 1006} 1007 1008/* Rounding operations (other than RNDD) require two instructions - the first 1009 * stores a rounded value (possibly the wrong way) in the dest register, but 1010 * also sets a per-channel "increment bit" in the flag register. A predicated 1011 * add of 1.0 fixes dest to contain the desired result. 1012 * 1013 * Sandybridge and later appear to round correctly without an ADD. 1014 */ 1015#define ROUND(OP) \ 1016void brw_##OP(struct brw_compile *p, \ 1017 struct brw_reg dest, \ 1018 struct brw_reg src) \ 1019{ \ 1020 struct brw_instruction *rnd, *add; \ 1021 rnd = next_insn(p, BRW_OPCODE_##OP); \ 1022 brw_set_dest(p, rnd, dest); \ 1023 brw_set_src0(p, rnd, src); \ 1024 \ 1025 if (p->brw->gen < 6) { \ 1026 /* turn on round-increments */ \ 1027 rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \ 1028 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \ 1029 add->header.predicate_control = BRW_PREDICATE_NORMAL; \ 1030 } \ 1031} 1032 1033 1034ALU1(MOV) 1035ALU2(SEL) 1036ALU1(NOT) 1037ALU2(AND) 1038ALU2(OR) 1039ALU2(XOR) 1040ALU2(SHR) 1041ALU2(SHL) 1042ALU2(ASR) 1043ALU1(F32TO16) 1044ALU1(F16TO32) 1045ALU1(FRC) 1046ALU1(RNDD) 1047ALU2(MAC) 1048ALU2(MACH) 1049ALU1(LZD) 1050ALU2(DP4) 1051ALU2(DPH) 1052ALU2(DP3) 1053ALU2(DP2) 1054ALU2(LINE) 1055ALU2(PLN) 1056ALU3F(MAD) 1057ALU3F(LRP) 1058ALU1(BFREV) 1059ALU3(BFE) 1060ALU2(BFI1) 1061ALU3(BFI2) 1062ALU1(FBH) 1063ALU1(FBL) 1064ALU1(CBIT) 1065ALU2(ADDC) 1066ALU2(SUBB) 1067 1068ROUND(RNDZ) 1069ROUND(RNDE) 1070 1071 1072struct brw_instruction *brw_ADD(struct brw_compile *p, 1073 struct brw_reg dest, 1074 struct brw_reg src0, 1075 struct brw_reg src1) 1076{ 1077 /* 6.2.2: add */ 1078 if (src0.type == BRW_REGISTER_TYPE_F || 1079 (src0.file == BRW_IMMEDIATE_VALUE && 1080 src0.type == BRW_REGISTER_TYPE_VF)) { 1081 assert(src1.type != BRW_REGISTER_TYPE_UD); 1082 assert(src1.type != BRW_REGISTER_TYPE_D); 1083 } 1084 1085 if 
(src1.type == BRW_REGISTER_TYPE_F || 1086 (src1.file == BRW_IMMEDIATE_VALUE && 1087 src1.type == BRW_REGISTER_TYPE_VF)) { 1088 assert(src0.type != BRW_REGISTER_TYPE_UD); 1089 assert(src0.type != BRW_REGISTER_TYPE_D); 1090 } 1091 1092 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1); 1093} 1094 1095struct brw_instruction *brw_AVG(struct brw_compile *p, 1096 struct brw_reg dest, 1097 struct brw_reg src0, 1098 struct brw_reg src1) 1099{ 1100 assert(dest.type == src0.type); 1101 assert(src0.type == src1.type); 1102 switch (src0.type) { 1103 case BRW_REGISTER_TYPE_B: 1104 case BRW_REGISTER_TYPE_UB: 1105 case BRW_REGISTER_TYPE_W: 1106 case BRW_REGISTER_TYPE_UW: 1107 case BRW_REGISTER_TYPE_D: 1108 case BRW_REGISTER_TYPE_UD: 1109 break; 1110 default: 1111 assert(!"Bad type for brw_AVG"); 1112 } 1113 1114 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1); 1115} 1116 1117struct brw_instruction *brw_MUL(struct brw_compile *p, 1118 struct brw_reg dest, 1119 struct brw_reg src0, 1120 struct brw_reg src1) 1121{ 1122 /* 6.32.38: mul */ 1123 if (src0.type == BRW_REGISTER_TYPE_D || 1124 src0.type == BRW_REGISTER_TYPE_UD || 1125 src1.type == BRW_REGISTER_TYPE_D || 1126 src1.type == BRW_REGISTER_TYPE_UD) { 1127 assert(dest.type != BRW_REGISTER_TYPE_F); 1128 } 1129 1130 if (src0.type == BRW_REGISTER_TYPE_F || 1131 (src0.file == BRW_IMMEDIATE_VALUE && 1132 src0.type == BRW_REGISTER_TYPE_VF)) { 1133 assert(src1.type != BRW_REGISTER_TYPE_UD); 1134 assert(src1.type != BRW_REGISTER_TYPE_D); 1135 } 1136 1137 if (src1.type == BRW_REGISTER_TYPE_F || 1138 (src1.file == BRW_IMMEDIATE_VALUE && 1139 src1.type == BRW_REGISTER_TYPE_VF)) { 1140 assert(src0.type != BRW_REGISTER_TYPE_UD); 1141 assert(src0.type != BRW_REGISTER_TYPE_D); 1142 } 1143 1144 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE || 1145 src0.nr != BRW_ARF_ACCUMULATOR); 1146 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE || 1147 src1.nr != BRW_ARF_ACCUMULATOR); 1148 1149 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, 
src1); 1150} 1151 1152 1153void brw_NOP(struct brw_compile *p) 1154{ 1155 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP); 1156 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); 1157 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); 1158 brw_set_src1(p, insn, brw_imm_ud(0x0)); 1159} 1160 1161 1162 1163 1164 1165/*********************************************************************** 1166 * Comparisons, if/else/endif 1167 */ 1168 1169struct brw_instruction *brw_JMPI(struct brw_compile *p, 1170 struct brw_reg index, 1171 unsigned predicate_control) 1172{ 1173 struct brw_reg ip = brw_ip_reg(); 1174 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index); 1175 1176 insn->header.execution_size = 1; 1177 insn->header.compression_control = BRW_COMPRESSION_NONE; 1178 insn->header.mask_control = BRW_MASK_DISABLE; 1179 insn->header.predicate_control = predicate_control; 1180 1181 return insn; 1182} 1183 1184static void 1185push_if_stack(struct brw_compile *p, struct brw_instruction *inst) 1186{ 1187 p->if_stack[p->if_stack_depth] = inst - p->store; 1188 1189 p->if_stack_depth++; 1190 if (p->if_stack_array_size <= p->if_stack_depth) { 1191 p->if_stack_array_size *= 2; 1192 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int, 1193 p->if_stack_array_size); 1194 } 1195} 1196 1197static struct brw_instruction * 1198pop_if_stack(struct brw_compile *p) 1199{ 1200 p->if_stack_depth--; 1201 return &p->store[p->if_stack[p->if_stack_depth]]; 1202} 1203 1204static void 1205push_loop_stack(struct brw_compile *p, struct brw_instruction *inst) 1206{ 1207 if (p->loop_stack_array_size < p->loop_stack_depth) { 1208 p->loop_stack_array_size *= 2; 1209 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int, 1210 p->loop_stack_array_size); 1211 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int, 1212 p->loop_stack_array_size); 1213 } 1214 1215 p->loop_stack[p->loop_stack_depth] = inst - p->store; 1216 
p->loop_stack_depth++; 1217 p->if_depth_in_loop[p->loop_stack_depth] = 0; 1218} 1219 1220static struct brw_instruction * 1221get_inner_do_insn(struct brw_compile *p) 1222{ 1223 return &p->store[p->loop_stack[p->loop_stack_depth - 1]]; 1224} 1225 1226/* EU takes the value from the flag register and pushes it onto some 1227 * sort of a stack (presumably merging with any flag value already on 1228 * the stack). Within an if block, the flags at the top of the stack 1229 * control execution on each channel of the unit, eg. on each of the 1230 * 16 pixel values in our wm programs. 1231 * 1232 * When the matching 'else' instruction is reached (presumably by 1233 * countdown of the instruction count patched in by our ELSE/ENDIF 1234 * functions), the relevent flags are inverted. 1235 * 1236 * When the matching 'endif' instruction is reached, the flags are 1237 * popped off. If the stack is now empty, normal execution resumes. 1238 */ 1239struct brw_instruction * 1240brw_IF(struct brw_compile *p, unsigned execute_size) 1241{ 1242 struct brw_context *brw = p->brw; 1243 struct brw_instruction *insn; 1244 1245 insn = next_insn(p, BRW_OPCODE_IF); 1246 1247 /* Override the defaults for this instruction: 1248 */ 1249 if (brw->gen < 6) { 1250 brw_set_dest(p, insn, brw_ip_reg()); 1251 brw_set_src0(p, insn, brw_ip_reg()); 1252 brw_set_src1(p, insn, brw_imm_d(0x0)); 1253 } else if (brw->gen == 6) { 1254 brw_set_dest(p, insn, brw_imm_w(0)); 1255 insn->bits1.branch_gen6.jump_count = 0; 1256 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1257 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1258 } else { 1259 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1260 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D))); 1261 brw_set_src1(p, insn, brw_imm_ud(0)); 1262 insn->bits3.break_cont.jip = 0; 1263 insn->bits3.break_cont.uip = 0; 1264 } 1265 1266 insn->header.execution_size = execute_size; 
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   /* Flow-control instructions imply a thread switch except in SPF mode. */
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Remember this IF so brw_ENDIF can patch its jump targets later. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Dest is an immediate word; jump_count is patched by brw_ENDIF. */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* The comparison is embedded in the IF via the conditional modifier. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* Jump distances are in bytes: 16 bytes per 128-bit instruction. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         /* Gen7+: both UIP and JIP point at the ENDIF. */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}

/* Emit an ELSE and push it on the if-stack; its jump targets are filled in
 * later by patch_IF_ELSE (called from brw_ENDIF).
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Per-gen operand encoding; see the matching cases in brw_IF. */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}

/* Emit an ENDIF (or, in pre-gen6 SPF mode, rewrite the pending IF/ELSE as
 * ADDs instead), then patch the matching IF/ELSE jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base adress of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-gen operand encoding; see the matching cases in brw_IF. */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

/* Emit a BREAK.  On pre-gen6 the jump_count is left at zero and patched by
 * brw_patch_break_cont() when the enclosing WHILE is emitted.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop every if-stack level currently open inside this loop. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
insn->header.compression_control = BRW_COMPRESSION_NONE; 1570 insn->header.execution_size = BRW_EXECUTE_8; 1571 1572 return insn; 1573} 1574 1575struct brw_instruction *gen6_CONT(struct brw_compile *p) 1576{ 1577 struct brw_instruction *insn; 1578 1579 insn = next_insn(p, BRW_OPCODE_CONTINUE); 1580 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1581 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1582 brw_set_dest(p, insn, brw_ip_reg()); 1583 brw_set_src0(p, insn, brw_ip_reg()); 1584 brw_set_src1(p, insn, brw_imm_d(0x0)); 1585 1586 insn->header.compression_control = BRW_COMPRESSION_NONE; 1587 insn->header.execution_size = BRW_EXECUTE_8; 1588 return insn; 1589} 1590 1591struct brw_instruction *brw_CONT(struct brw_compile *p) 1592{ 1593 struct brw_instruction *insn; 1594 insn = next_insn(p, BRW_OPCODE_CONTINUE); 1595 brw_set_dest(p, insn, brw_ip_reg()); 1596 brw_set_src0(p, insn, brw_ip_reg()); 1597 brw_set_src1(p, insn, brw_imm_d(0x0)); 1598 insn->header.compression_control = BRW_COMPRESSION_NONE; 1599 insn->header.execution_size = BRW_EXECUTE_8; 1600 /* insn->header.mask_control = BRW_MASK_DISABLE; */ 1601 insn->bits3.if_else.pad0 = 0; 1602 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth]; 1603 return insn; 1604} 1605 1606struct brw_instruction *gen6_HALT(struct brw_compile *p) 1607{ 1608 struct brw_instruction *insn; 1609 1610 insn = next_insn(p, BRW_OPCODE_HALT); 1611 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1612 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1613 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. 
*/ 1614 1615 if (p->compressed) { 1616 insn->header.execution_size = BRW_EXECUTE_16; 1617 } else { 1618 insn->header.compression_control = BRW_COMPRESSION_NONE; 1619 insn->header.execution_size = BRW_EXECUTE_8; 1620 } 1621 return insn; 1622} 1623 1624/* DO/WHILE loop: 1625 * 1626 * The DO/WHILE is just an unterminated loop -- break or continue are 1627 * used for control within the loop. We have a few ways they can be 1628 * done. 1629 * 1630 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip, 1631 * jip and no DO instruction. 1632 * 1633 * For non-uniform control flow pre-gen6, there's a DO instruction to 1634 * push the mask, and a WHILE to jump back, and BREAK to get out and 1635 * pop the mask. 1636 * 1637 * For gen6, there's no more mask stack, so no need for DO. WHILE 1638 * just points back to the first instruction of the loop. 1639 */ 1640struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size) 1641{ 1642 struct brw_context *brw = p->brw; 1643 1644 if (brw->gen >= 6 || p->single_program_flow) { 1645 push_loop_stack(p, &p->store[p->nr_insn]); 1646 return &p->store[p->nr_insn]; 1647 } else { 1648 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO); 1649 1650 push_loop_stack(p, insn); 1651 1652 /* Override the defaults for this instruction: 1653 */ 1654 brw_set_dest(p, insn, brw_null_reg()); 1655 brw_set_src0(p, insn, brw_null_reg()); 1656 brw_set_src1(p, insn, brw_null_reg()); 1657 1658 insn->header.compression_control = BRW_COMPRESSION_NONE; 1659 insn->header.execution_size = execute_size; 1660 insn->header.predicate_control = BRW_PREDICATE_NONE; 1661 /* insn->header.mask_control = BRW_MASK_ENABLE; */ 1662 /* insn->header.mask_control = BRW_MASK_DISABLE; */ 1663 1664 return insn; 1665 } 1666} 1667 1668/** 1669 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE 1670 * instruction here. 
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks: 2 per instruction on gen5, 1 before. */
   int br = (brw->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to the matching DO. */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
          inst->bits3.if_else.jump_count == 0) {
         /* BREAK jumps just past the WHILE (out of the loop). */
         inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
                 inst->bits3.if_else.jump_count == 0) {
         /* CONTINUE jumps to the WHILE itself (next iteration test). */
         inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}

/* Close the innermost DO/WHILE loop: emit the backwards-jumping WHILE (or
 * an IP-relative ADD in pre-gen6 SPF mode) and pop the loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   unsigned br = 1;

   /* Jump distances count 64-bit chunks from gen5 onwards. */
   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Negative JIP: jump back to the top of the loop. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         /* SPF mode: the loop-back is just an IP-relative ADD (16 bytes per
          * instruction).
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         /* Now that the WHILE exists, fill in the jump targets of every
          * unpatched BREAK/CONT inside this loop.
          */
         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   p->loop_stack_depth--;

   return insn;
}

/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   /* JMPI distances count 64-bit chunks from gen5 onwards. */
   unsigned jmpi = 1;

   if (brw->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   /* Point the earlier JMPI at the current end of the instruction store. */
   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}

/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
1781 */ 1782void brw_CMP(struct brw_compile *p, 1783 struct brw_reg dest, 1784 unsigned conditional, 1785 struct brw_reg src0, 1786 struct brw_reg src1) 1787{ 1788 struct brw_context *brw = p->brw; 1789 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP); 1790 1791 insn->header.destreg__conditionalmod = conditional; 1792 brw_set_dest(p, insn, dest); 1793 brw_set_src0(p, insn, src0); 1794 brw_set_src1(p, insn, src1); 1795 1796 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds 1797 * page says: 1798 * "Any CMP instruction with a null destination must use a {switch}." 1799 * 1800 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't 1801 * mentioned on their work-arounds pages. 1802 */ 1803 if (brw->gen == 7) { 1804 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE && 1805 dest.nr == BRW_ARF_NULL) { 1806 insn->header.thread_control = BRW_THREAD_SWITCH; 1807 } 1808 } 1809} 1810 1811/* Issue 'wait' instruction for n1, host could program MMIO 1812 to wake up thread. */ 1813void brw_WAIT (struct brw_compile *p) 1814{ 1815 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT); 1816 struct brw_reg src = brw_notification_1_reg(); 1817 1818 brw_set_dest(p, insn, src); 1819 brw_set_src0(p, insn, src); 1820 brw_set_src1(p, insn, brw_null_reg()); 1821 insn->header.execution_size = 0; /* must */ 1822 insn->header.predicate_control = 0; 1823 insn->header.compression_control = 0; 1824} 1825 1826 1827/*********************************************************************** 1828 * Helpers for the various SEND message types: 1829 */ 1830 1831/** Extended math function, float[8]. 
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       unsigned msg_reg_nr,
	       struct brw_reg src,
	       unsigned data_type,
	       unsigned precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      /* Gen6+: math is a regular execution-unit instruction, not a SEND
       * message; msg_reg_nr, data_type and precision are unused here.
       */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer-divide variants require an integer source type. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      /* Pre-gen6: math goes through the shared math unit as a SEND message. */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}

/** Extended math function, float[8].
1897 */ 1898void brw_math2(struct brw_compile *p, 1899 struct brw_reg dest, 1900 unsigned function, 1901 struct brw_reg src0, 1902 struct brw_reg src1) 1903{ 1904 struct brw_context *brw = p->brw; 1905 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH); 1906 1907 assert(dest.file == BRW_GENERAL_REGISTER_FILE || 1908 (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE)); 1909 assert(src0.file == BRW_GENERAL_REGISTER_FILE); 1910 assert(src1.file == BRW_GENERAL_REGISTER_FILE); 1911 1912 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1); 1913 if (brw->gen == 6) { 1914 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1); 1915 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1); 1916 } 1917 1918 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT || 1919 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER || 1920 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) { 1921 assert(src0.type != BRW_REGISTER_TYPE_F); 1922 assert(src1.type != BRW_REGISTER_TYPE_F); 1923 } else { 1924 assert(src0.type == BRW_REGISTER_TYPE_F); 1925 assert(src1.type == BRW_REGISTER_TYPE_F); 1926 } 1927 1928 /* Source modifiers are ignored for extended math instructions on Gen6. */ 1929 if (brw->gen == 6) { 1930 assert(!src0.negate); 1931 assert(!src0.abs); 1932 assert(!src1.negate); 1933 assert(!src1.abs); 1934 } 1935 1936 /* Math is the same ISA format as other opcodes, except that CondModifier 1937 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4]. 1938 */ 1939 insn->header.destreg__conditionalmod = function; 1940 1941 brw_set_dest(p, insn, dest); 1942 brw_set_src0(p, insn, src0); 1943 brw_set_src1(p, insn, src1); 1944} 1945 1946 1947/** 1948 * Write a block of OWORDs (half a GRF each) from the scratch buffer, 1949 * using a constant offset per channel. 1950 * 1951 * The offset must be aligned to oword size (16 bytes). Used for 1952 * register spilling. 
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control, msg_type;
   int mlen;

   /* On Gen6+ the header's offset field is in owords, not bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One GRF holds two owords; mlen is header plus data registers. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header. This is g0, with g0.2 filled with
    * the offset. We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      /* SEND can't be compressed; force SIMD8 and widen the header source
       * to 16 elements instead.
       */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
         insn->header.compression_control = BRW_COMPRESSION_NONE;
         src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection. Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (brw->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (brw->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (brw->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes). Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control;
   int rlen;

   /* On Gen6+ the header's offset field is in owords, not bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* One GRF holds two owords; rlen is the number of data registers read. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: g0 with the scratch offset
    * stored in element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest); /* dest was retyped to UW above */
      if (brw->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
                              insn,
                              255, /* binding table index (255=stateless) */
                              msg_control,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
                              true, /* header_present */
                              rlen);
   }
}

/**
 * Gen7 scratch read using the data cache scratch-block message; the
 * scratch offset comes from g0.5 of the (required) header rather than a
 * manually built message register.
 */
void
gen7_block_read_scratch(struct brw_compile *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   brw_set_dest(p, insn, dest);

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   bool header_present = true;
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   brw_set_message_descriptor(p, insn,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              1, /* mlen: just g0 */
                              num_regs,
                              header_present,
                              false);

   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;

   /* The message encodes the register count as (count - 1); only 1, 2 and
    * 4 registers are representable.
    */
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));
   insn->bits3.ud |= offset;
}

/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_compile *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   struct brw_context *brw = p->brw;

   /* On newer hardware, offset is in units of owords. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Build the message header in the MRF: g0 with the read offset in
    * element 2.
    */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (brw->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}


/**
 * Emit a render target write message (SENDC on Gen6+, SEND before that).
 */
void brw_fb_WRITE(struct brw_compile *p,
                  int dispatch_width,
                  unsigned msg_reg_nr,
                  struct brw_reg src0,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool header_present)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;
   unsigned msg_type;
   struct brw_reg dest;

   /* The write produces no data; use a null destination sized to the
    * dispatch width.
    */
   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (brw->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (brw->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            msg_length,
                            header_present,
                            eot, /* last render target write */
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}


/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed. See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_compile *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* NOTE(review): msg_reg_nr is unsigned, so this compares against
    * (unsigned)-1 — callers apparently pass -1 to mean "payload already
    * in place, skip the implied move"; confirm against call sites.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.predicate_control = 0; /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf. For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
      insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style. Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
2350 */ 2351void brw_urb_WRITE(struct brw_compile *p, 2352 struct brw_reg dest, 2353 unsigned msg_reg_nr, 2354 struct brw_reg src0, 2355 enum brw_urb_write_flags flags, 2356 unsigned msg_length, 2357 unsigned response_length, 2358 unsigned offset, 2359 unsigned swizzle) 2360{ 2361 struct brw_context *brw = p->brw; 2362 struct brw_instruction *insn; 2363 2364 gen6_resolve_implied_move(p, &src0, msg_reg_nr); 2365 2366 if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) { 2367 /* Enable Channel Masks in the URB_WRITE_HWORD message header */ 2368 brw_push_insn_state(p); 2369 brw_set_default_access_mode(p, BRW_ALIGN_1); 2370 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 2371 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5), 2372 BRW_REGISTER_TYPE_UD), 2373 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 2374 brw_imm_ud(0xff00)); 2375 brw_pop_insn_state(p); 2376 } 2377 2378 insn = next_insn(p, BRW_OPCODE_SEND); 2379 2380 assert(msg_length < BRW_MAX_MRF); 2381 2382 brw_set_dest(p, insn, dest); 2383 brw_set_src0(p, insn, src0); 2384 brw_set_src1(p, insn, brw_imm_d(0)); 2385 2386 if (brw->gen < 6) 2387 insn->header.destreg__conditionalmod = msg_reg_nr; 2388 2389 brw_set_urb_message(p, 2390 insn, 2391 flags, 2392 msg_length, 2393 response_length, 2394 offset, 2395 swizzle); 2396} 2397 2398static int 2399brw_find_next_block_end(struct brw_compile *p, int start_offset) 2400{ 2401 int offset; 2402 void *store = p->store; 2403 2404 for (offset = next_offset(store, start_offset); offset < p->next_insn_offset; 2405 offset = next_offset(store, offset)) { 2406 struct brw_instruction *insn = store + offset; 2407 2408 switch (insn->header.opcode) { 2409 case BRW_OPCODE_ENDIF: 2410 case BRW_OPCODE_ELSE: 2411 case BRW_OPCODE_WHILE: 2412 case BRW_OPCODE_HALT: 2413 return offset; 2414 } 2415 } 2416 2417 return 0; 2418} 2419 2420/* There is no DO instruction on gen6, so to find the end of the loop 2421 * we have to see if the loop is jumping back 
before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start_offset)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8; /* bytes per jump-target unit */
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
         /* Gen6 stores the backward jump in jump_count; Gen7+ in JIP. */
         int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
                                 : insn->bits3.break_cont.jip;
         /* A WHILE that jumps back to (or before) our start instruction
          * terminates the loop we're inside.
          */
         if (offset + jip * scale <= start_offset)
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int offset;
   int scale = 8; /* bytes per jump-target unit */
   void *store = p->store;

   /* UIP/JIP only exist on Gen6+. */
   if (brw->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.cmpt_control) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(insn->header.opcode != BRW_OPCODE_BREAK &&
                insn->header.opcode != BRW_OPCODE_CONTINUE &&
                insn->header.opcode != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset +
             (brw->gen == 6 ? 16 : 0)) / scale;
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;

      case BRW_OPCODE_ENDIF:
         /* With no following block terminator, jump to the next
          * instruction (2 units * 8 bytes = one instruction).
          */
         if (block_end_offset == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
         } else {
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
         }
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
      }
   }
}

/**
 * Emit an FF_SYNC message; the payload encoding is handled by
 * brw_set_ff_sync_message().
 */
void brw_ff_sync(struct brw_compile *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write. According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *    writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_compile *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}

/**
 * Fill in the message descriptor for an untyped atomic operation.
 *
 * Haswell uses data cache port 1 (with a separate SIMD4x2 message type for
 * align16 mode); Ivybridge/Baytrail use the legacy data cache SFID.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  unsigned atomic_op,
                                  unsigned bind_table_index,
                                  unsigned msg_length,
                                  unsigned response_length,
                                  bool header_present)
{
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);


      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (insn->header.execution_size != BRW_EXECUTE_16)
            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */

         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
      } else {
         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
      }
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;

      if (insn->header.execution_size != BRW_EXECUTE_16)
         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
   }

   if (response_length)
      insn->bits3.ud |= 1 << 13; /* Return data expected */

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
   insn->bits3.ud |= atomic_op << 8; /* atomic operation selector */
}

/**
 * Emit an untyped atomic SEND against the given binding table entry.
 * The Align1/Align16 mode of the instruction decides whether a header is
 * present and which message variant is encoded.
 */
void
brw_untyped_atomic(struct brw_compile *p,
                   struct brw_reg dest,
                   struct brw_reg mrf,
                   unsigned atomic_op,
                   unsigned bind_table_index,
                   unsigned msg_length,
                   unsigned response_length) {
   struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, bind_table_index, msg_length, response_length,
      insn->header.access_mode == BRW_ALIGN_1);
}

/**
 * Fill in the message descriptor for an untyped surface read.
 */
static void
brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
                                        struct brw_instruction *insn,
                                        unsigned bind_table_index,
                                        unsigned msg_length,
                                        unsigned response_length,
                                        bool header_present)
{
   /* Derive the SIMD width from the instruction, and from that the number
    * of channels returned (one response register per 8 channels).
    */
   const unsigned dispatch_width =
      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
   const unsigned num_channels = response_length / (dispatch_width / 8);

   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
   }

   if (insn->header.access_mode == BRW_ALIGN_1) {
      if (dispatch_width == 16)
         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
      else
         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
   }

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;

   /* Set mask of 32-bit channels to drop. */
   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
}

/**
 * Emit an untyped surface read SEND; the message variant is chosen from
 * the instruction's access mode and execution size.
 */
void
brw_untyped_surface_read(struct brw_compile *p,
                         struct brw_reg dest,
                         struct brw_reg mrf,
                         unsigned bind_table_index,
                         unsigned msg_length,
                         unsigned response_length)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
   brw_set_dp_untyped_surface_read_message(
      p, insn, bind_table_index, msg_length, response_length,
      insn->header.access_mode == BRW_ALIGN_1);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32.
So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_compile *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *send;

   assert(brw->gen >= 7);

   /* Emit the SEND itself with ALIGN1 access and masking disabled, then
    * restore the caller's default state.
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file, payload.nr, 0));
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
                                     2 /* message length */,
                                     0 /* response length */,
                                     false /* header present */);
}