/* brw_eu_emit.c — revision dd7290cf59206c49f1a322d53baa9957b13d2949 */
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
45 */ 46void 47gen6_resolve_implied_move(struct brw_codegen *p, 48 struct brw_reg *src, 49 unsigned msg_reg_nr) 50{ 51 const struct brw_device_info *devinfo = p->devinfo; 52 if (devinfo->gen < 6) 53 return; 54 55 if (src->file == BRW_MESSAGE_REGISTER_FILE) 56 return; 57 58 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { 59 brw_push_insn_state(p); 60 brw_set_default_exec_size(p, BRW_EXECUTE_8); 61 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 62 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 63 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD), 64 retype(*src, BRW_REGISTER_TYPE_UD)); 65 brw_pop_insn_state(p); 66 } 67 *src = brw_message_reg(msg_reg_nr); 68} 69 70static void 71gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg) 72{ 73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"): 74 * "The send with EOT should use register space R112-R127 for <src>. This is 75 * to enable loading of a new thread into the same slot while the message 76 * with EOT for current thread is pending dispatch." 77 * 78 * Since we're pretending to have 16 MRFs anyway, we may as well use the 79 * registers required for messages with EOT. 80 */ 81 const struct brw_device_info *devinfo = p->devinfo; 82 if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) { 83 reg->file = BRW_GENERAL_REGISTER_FILE; 84 reg->nr += GEN7_MRF_HACK_START; 85 } 86} 87 88/** 89 * Convert a brw_reg_type enumeration value into the hardware representation. 90 * 91 * The hardware encoding may depend on whether the value is an immediate. 
92 */ 93unsigned 94brw_reg_type_to_hw_type(const struct brw_device_info *devinfo, 95 enum brw_reg_type type, unsigned file) 96{ 97 if (file == BRW_IMMEDIATE_VALUE) { 98 static const int imm_hw_types[] = { 99 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD, 100 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D, 101 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW, 102 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W, 103 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F, 104 [BRW_REGISTER_TYPE_UB] = -1, 105 [BRW_REGISTER_TYPE_B] = -1, 106 [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV, 107 [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF, 108 [BRW_REGISTER_TYPE_V] = BRW_HW_REG_IMM_TYPE_V, 109 [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF, 110 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF, 111 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ, 112 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q, 113 }; 114 assert(type < ARRAY_SIZE(imm_hw_types)); 115 assert(imm_hw_types[type] != -1); 116 assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF); 117 return imm_hw_types[type]; 118 } else { 119 /* Non-immediate registers */ 120 static const int hw_types[] = { 121 [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD, 122 [BRW_REGISTER_TYPE_D] = BRW_HW_REG_TYPE_D, 123 [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW, 124 [BRW_REGISTER_TYPE_W] = BRW_HW_REG_TYPE_W, 125 [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB, 126 [BRW_REGISTER_TYPE_B] = BRW_HW_REG_NON_IMM_TYPE_B, 127 [BRW_REGISTER_TYPE_F] = BRW_HW_REG_TYPE_F, 128 [BRW_REGISTER_TYPE_UV] = -1, 129 [BRW_REGISTER_TYPE_VF] = -1, 130 [BRW_REGISTER_TYPE_V] = -1, 131 [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF, 132 [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF, 133 [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ, 134 [BRW_REGISTER_TYPE_Q] = GEN8_HW_REG_TYPE_Q, 135 }; 136 assert(type < ARRAY_SIZE(hw_types)); 137 assert(hw_types[type] != -1); 138 assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF); 139 assert(devinfo->gen >= 8 || type < 
BRW_REGISTER_TYPE_HF); 140 return hw_types[type]; 141 } 142} 143 144void 145brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) 146{ 147 const struct brw_device_info *devinfo = p->devinfo; 148 149 if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE && 150 dest.file != BRW_MESSAGE_REGISTER_FILE) 151 assert(dest.nr < 128); 152 153 gen7_convert_mrf_to_grf(p, &dest); 154 155 brw_inst_set_dst_reg_file(devinfo, inst, dest.file); 156 brw_inst_set_dst_reg_type(devinfo, inst, 157 brw_reg_type_to_hw_type(devinfo, dest.type, 158 dest.file)); 159 brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode); 160 161 if (dest.address_mode == BRW_ADDRESS_DIRECT) { 162 brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr); 163 164 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 165 brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr); 166 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) 167 dest.hstride = BRW_HORIZONTAL_STRIDE_1; 168 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); 169 } else { 170 brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); 171 brw_inst_set_da16_writemask(devinfo, inst, dest.dw1.bits.writemask); 172 if (dest.file == BRW_GENERAL_REGISTER_FILE || 173 dest.file == BRW_MESSAGE_REGISTER_FILE) { 174 assert(dest.dw1.bits.writemask != 0); 175 } 176 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1: 177 * Although Dst.HorzStride is a don't care for Align16, HW needs 178 * this to be programmed as "01". 
179 */ 180 brw_inst_set_dst_hstride(devinfo, inst, 1); 181 } 182 } else { 183 brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr); 184 185 /* These are different sizes in align1 vs align16: 186 */ 187 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 188 brw_inst_set_dst_ia1_addr_imm(devinfo, inst, 189 dest.dw1.bits.indirect_offset); 190 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) 191 dest.hstride = BRW_HORIZONTAL_STRIDE_1; 192 brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); 193 } else { 194 brw_inst_set_dst_ia16_addr_imm(devinfo, inst, 195 dest.dw1.bits.indirect_offset); 196 /* even ignored in da16, still need to set as '01' */ 197 brw_inst_set_dst_hstride(devinfo, inst, 1); 198 } 199 } 200 201 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8) 202 * or 16 (SIMD16), as that's normally correct. However, when dealing with 203 * small registers, we automatically reduce it to match the register size. 204 */ 205 if (dest.width < BRW_EXECUTE_8) 206 brw_inst_set_exec_size(devinfo, inst, dest.width); 207} 208 209extern int reg_type_size[]; 210 211static void 212validate_reg(const struct brw_device_info *devinfo, 213 brw_inst *inst, struct brw_reg reg) 214{ 215 const int hstride_for_reg[] = {0, 1, 2, 4}; 216 const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32}; 217 const int width_for_reg[] = {1, 2, 4, 8, 16}; 218 const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32}; 219 int width, hstride, vstride, execsize; 220 221 if (reg.file == BRW_IMMEDIATE_VALUE) { 222 /* 3.3.6: Region Parameters. Restriction: Immediate vectors 223 * mean the destination has to be 128-bit aligned and the 224 * destination horiz stride has to be a word. 
225 */ 226 if (reg.type == BRW_REGISTER_TYPE_V) { 227 assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] * 228 reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2); 229 } 230 231 return; 232 } 233 234 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && 235 reg.file == BRW_ARF_NULL) 236 return; 237 238 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: 239 * 240 * "Swizzling is not allowed when an accumulator is used as an implicit 241 * source or an explicit source in an instruction." 242 */ 243 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && 244 reg.nr == BRW_ARF_ACCUMULATOR) 245 assert(reg.dw1.bits.swizzle == BRW_SWIZZLE_XYZW); 246 247 assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg)); 248 hstride = hstride_for_reg[reg.hstride]; 249 250 if (reg.vstride == 0xf) { 251 vstride = -1; 252 } else { 253 assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg)); 254 vstride = vstride_for_reg[reg.vstride]; 255 } 256 257 assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg)); 258 width = width_for_reg[reg.width]; 259 260 assert(brw_inst_exec_size(devinfo, inst) >= 0 && 261 brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg)); 262 execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)]; 263 264 /* Restrictions from 3.3.10: Register Region Restrictions. */ 265 /* 3. */ 266 assert(execsize >= width); 267 268 /* 4. */ 269 if (execsize == width && hstride != 0) { 270 assert(vstride == -1 || vstride == width * hstride); 271 } 272 273 /* 5. */ 274 if (execsize == width && hstride == 0) { 275 /* no restriction on vstride. */ 276 } 277 278 /* 6. */ 279 if (width == 1) { 280 assert(hstride == 0); 281 } 282 283 /* 7. */ 284 if (execsize == 1 && width == 1) { 285 assert(hstride == 0); 286 assert(vstride == 0); 287 } 288 289 /* 8. */ 290 if (vstride == 0 && hstride == 0) { 291 assert(width == 1); 292 } 293 294 /* 10. Check destination issues. 
*/ 295} 296 297static bool 298is_compactable_immediate(unsigned imm) 299{ 300 /* We get the low 12 bits as-is. */ 301 imm &= ~0xfff; 302 303 /* We get one bit replicated through the top 20 bits. */ 304 return imm == 0 || imm == 0xfffff000; 305} 306 307void 308brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) 309{ 310 const struct brw_device_info *devinfo = p->devinfo; 311 312 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE) 313 assert(reg.nr < 128); 314 315 gen7_convert_mrf_to_grf(p, ®); 316 317 if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND || 318 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) { 319 /* Any source modifiers or regions will be ignored, since this just 320 * identifies the MRF/GRF to start reading the message contents from. 321 * Check for some likely failures. 322 */ 323 assert(!reg.negate); 324 assert(!reg.abs); 325 assert(reg.address_mode == BRW_ADDRESS_DIRECT); 326 } 327 328 validate_reg(devinfo, inst, reg); 329 330 brw_inst_set_src0_reg_file(devinfo, inst, reg.file); 331 brw_inst_set_src0_reg_type(devinfo, inst, 332 brw_reg_type_to_hw_type(devinfo, reg.type, reg.file)); 333 brw_inst_set_src0_abs(devinfo, inst, reg.abs); 334 brw_inst_set_src0_negate(devinfo, inst, reg.negate); 335 brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode); 336 337 if (reg.file == BRW_IMMEDIATE_VALUE) { 338 brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud); 339 340 /* The Bspec's section titled "Non-present Operands" claims that if src0 341 * is an immediate that src1's type must be the same as that of src0. 342 * 343 * The SNB+ DataTypeIndex instruction compaction tables contain mappings 344 * that do not follow this rule. 
E.g., from the IVB/HSW table: 345 * 346 * DataTypeIndex 18-Bit Mapping Mapped Meaning 347 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir | 348 * 349 * And from the SNB table: 350 * 351 * DataTypeIndex 18-Bit Mapping Mapped Meaning 352 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir | 353 * 354 * Neither of these cause warnings from the simulator when used, 355 * compacted or otherwise. In fact, all compaction mappings that have an 356 * immediate in src0 use a:ud for src1. 357 * 358 * The GM45 instruction compaction tables do not contain mapped meanings 359 * so it's not clear whether it has the restriction. We'll assume it was 360 * lifted on SNB. (FINISHME: decode the GM45 tables and check.) 361 */ 362 brw_inst_set_src1_reg_file(devinfo, inst, BRW_ARCHITECTURE_REGISTER_FILE); 363 if (devinfo->gen < 6) { 364 brw_inst_set_src1_reg_type(devinfo, inst, 365 brw_inst_src0_reg_type(devinfo, inst)); 366 } else { 367 brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD); 368 } 369 370 /* Compacted instructions only have 12-bits (plus 1 for the other 20) 371 * for immediate values. Presumably the hardware engineers realized 372 * that the only useful floating-point value that could be represented 373 * in this format is 0.0, which can also be represented as a VF-typed 374 * immediate, so they gave us the previously mentioned mapping on IVB+. 375 * 376 * Strangely, we do have a mapping for imm:f in src1, so we don't need 377 * to do this there. 378 * 379 * If we see a 0.0:F, change the type to VF so that it can be compacted. 380 */ 381 if (brw_inst_imm_ud(devinfo, inst) == 0x0 && 382 brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) { 383 brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF); 384 } 385 386 /* There are no mappings for dst:d | i:d, so if the immediate is suitable 387 * set the types to :UD so the instruction can be compacted. 
388 */ 389 if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) && 390 brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE && 391 brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D && 392 brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) { 393 brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD); 394 brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD); 395 } 396 } else { 397 if (reg.address_mode == BRW_ADDRESS_DIRECT) { 398 brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr); 399 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 400 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr); 401 } else { 402 brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16); 403 } 404 } else { 405 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr); 406 407 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 408 brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset); 409 } else { 410 brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.dw1.bits.indirect_offset); 411 } 412 } 413 414 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 415 if (reg.width == BRW_WIDTH_1 && 416 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { 417 brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); 418 brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1); 419 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); 420 } else { 421 brw_inst_set_src0_hstride(devinfo, inst, reg.hstride); 422 brw_inst_set_src0_width(devinfo, inst, reg.width); 423 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); 424 } 425 } else { 426 brw_inst_set_src0_da16_swiz_x(devinfo, inst, 427 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X)); 428 brw_inst_set_src0_da16_swiz_y(devinfo, inst, 429 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y)); 430 brw_inst_set_src0_da16_swiz_z(devinfo, inst, 431 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z)); 432 brw_inst_set_src0_da16_swiz_w(devinfo, inst, 
433 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W)); 434 435 /* This is an oddity of the fact we're using the same 436 * descriptions for registers in align_16 as align_1: 437 */ 438 if (reg.vstride == BRW_VERTICAL_STRIDE_8) 439 brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); 440 else 441 brw_inst_set_src0_vstride(devinfo, inst, reg.vstride); 442 } 443 } 444} 445 446 447void 448brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) 449{ 450 const struct brw_device_info *devinfo = p->devinfo; 451 452 if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE) 453 assert(reg.nr < 128); 454 455 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: 456 * 457 * "Accumulator registers may be accessed explicitly as src0 458 * operands only." 459 */ 460 assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE || 461 reg.nr != BRW_ARF_ACCUMULATOR); 462 463 gen7_convert_mrf_to_grf(p, ®); 464 assert(reg.file != BRW_MESSAGE_REGISTER_FILE); 465 466 validate_reg(devinfo, inst, reg); 467 468 brw_inst_set_src1_reg_file(devinfo, inst, reg.file); 469 brw_inst_set_src1_reg_type(devinfo, inst, 470 brw_reg_type_to_hw_type(devinfo, reg.type, reg.file)); 471 brw_inst_set_src1_abs(devinfo, inst, reg.abs); 472 brw_inst_set_src1_negate(devinfo, inst, reg.negate); 473 474 /* Only src1 can be immediate in two-argument instructions. 
475 */ 476 assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE); 477 478 if (reg.file == BRW_IMMEDIATE_VALUE) { 479 brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud); 480 } else { 481 /* This is a hardware restriction, which may or may not be lifted 482 * in the future: 483 */ 484 assert (reg.address_mode == BRW_ADDRESS_DIRECT); 485 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */ 486 487 brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr); 488 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 489 brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr); 490 } else { 491 brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16); 492 } 493 494 if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { 495 if (reg.width == BRW_WIDTH_1 && 496 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) { 497 brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0); 498 brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1); 499 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0); 500 } else { 501 brw_inst_set_src1_hstride(devinfo, inst, reg.hstride); 502 brw_inst_set_src1_width(devinfo, inst, reg.width); 503 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); 504 } 505 } else { 506 brw_inst_set_src1_da16_swiz_x(devinfo, inst, 507 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X)); 508 brw_inst_set_src1_da16_swiz_y(devinfo, inst, 509 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y)); 510 brw_inst_set_src1_da16_swiz_z(devinfo, inst, 511 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z)); 512 brw_inst_set_src1_da16_swiz_w(devinfo, inst, 513 BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W)); 514 515 /* This is an oddity of the fact we're using the same 516 * descriptions for registers in align_16 as align_1: 517 */ 518 if (reg.vstride == BRW_VERTICAL_STRIDE_8) 519 brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4); 520 else 521 brw_inst_set_src1_vstride(devinfo, inst, reg.vstride); 522 } 523 } 524} 525 526/** 527 * Set 
the Message Descriptor and Extended Message Descriptor fields 528 * for SEND messages. 529 * 530 * \note This zeroes out the Function Control bits, so it must be called 531 * \b before filling out any message-specific data. Callers can 532 * choose not to fill in irrelevant bits; they will be zero. 533 */ 534static void 535brw_set_message_descriptor(struct brw_codegen *p, 536 brw_inst *inst, 537 enum brw_message_target sfid, 538 unsigned msg_length, 539 unsigned response_length, 540 bool header_present, 541 bool end_of_thread) 542{ 543 const struct brw_device_info *devinfo = p->devinfo; 544 545 brw_set_src1(p, inst, brw_imm_d(0)); 546 547 /* For indirect sends, `inst` will not be the SEND/SENDC instruction 548 * itself; instead, it will be a MOV/OR into the address register. 549 * 550 * In this case, we avoid setting the extended message descriptor bits, 551 * since they go on the later SEND/SENDC instead and if set here would 552 * instead clobber the conditionalmod bits. 553 */ 554 unsigned opcode = brw_inst_opcode(devinfo, inst); 555 if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) { 556 brw_inst_set_sfid(devinfo, inst, sfid); 557 } 558 559 brw_inst_set_mlen(devinfo, inst, msg_length); 560 brw_inst_set_rlen(devinfo, inst, response_length); 561 brw_inst_set_eot(devinfo, inst, end_of_thread); 562 563 if (devinfo->gen >= 5) { 564 brw_inst_set_header_present(devinfo, inst, header_present); 565 } 566} 567 568static void brw_set_math_message( struct brw_codegen *p, 569 brw_inst *inst, 570 unsigned function, 571 unsigned integer_type, 572 bool low_precision, 573 unsigned dataType ) 574{ 575 const struct brw_device_info *devinfo = p->devinfo; 576 unsigned msg_length; 577 unsigned response_length; 578 579 /* Infer message length from the function */ 580 switch (function) { 581 case BRW_MATH_FUNCTION_POW: 582 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: 583 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: 584 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: 585 
msg_length = 2; 586 break; 587 default: 588 msg_length = 1; 589 break; 590 } 591 592 /* Infer response length from the function */ 593 switch (function) { 594 case BRW_MATH_FUNCTION_SINCOS: 595 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: 596 response_length = 2; 597 break; 598 default: 599 response_length = 1; 600 break; 601 } 602 603 604 brw_set_message_descriptor(p, inst, BRW_SFID_MATH, 605 msg_length, response_length, false, false); 606 brw_inst_set_math_msg_function(devinfo, inst, function); 607 brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type); 608 brw_inst_set_math_msg_precision(devinfo, inst, low_precision); 609 brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst)); 610 brw_inst_set_math_msg_data_type(devinfo, inst, dataType); 611 brw_inst_set_saturate(devinfo, inst, 0); 612} 613 614 615static void brw_set_ff_sync_message(struct brw_codegen *p, 616 brw_inst *insn, 617 bool allocate, 618 unsigned response_length, 619 bool end_of_thread) 620{ 621 const struct brw_device_info *devinfo = p->devinfo; 622 623 brw_set_message_descriptor(p, insn, BRW_SFID_URB, 624 1, response_length, true, end_of_thread); 625 brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */ 626 brw_inst_set_urb_allocate(devinfo, insn, allocate); 627 /* The following fields are not used by FF_SYNC: */ 628 brw_inst_set_urb_global_offset(devinfo, insn, 0); 629 brw_inst_set_urb_swizzle_control(devinfo, insn, 0); 630 brw_inst_set_urb_used(devinfo, insn, 0); 631 brw_inst_set_urb_complete(devinfo, insn, 0); 632} 633 634static void brw_set_urb_message( struct brw_codegen *p, 635 brw_inst *insn, 636 enum brw_urb_write_flags flags, 637 unsigned msg_length, 638 unsigned response_length, 639 unsigned offset, 640 unsigned swizzle_control ) 641{ 642 const struct brw_device_info *devinfo = p->devinfo; 643 644 assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE); 645 assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE)); 646 
assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); 647 648 brw_set_message_descriptor(p, insn, BRW_SFID_URB, 649 msg_length, response_length, true, 650 flags & BRW_URB_WRITE_EOT); 651 652 if (flags & BRW_URB_WRITE_OWORD) { 653 assert(msg_length == 2); /* header + one OWORD of data */ 654 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD); 655 } else { 656 brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD); 657 } 658 659 brw_inst_set_urb_global_offset(devinfo, insn, offset); 660 brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control); 661 662 if (devinfo->gen < 8) { 663 brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE)); 664 } 665 666 if (devinfo->gen < 7) { 667 brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE)); 668 brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED)); 669 } else { 670 brw_inst_set_urb_per_slot_offset(devinfo, insn, 671 !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET)); 672 } 673} 674 675void 676brw_set_dp_write_message(struct brw_codegen *p, 677 brw_inst *insn, 678 unsigned binding_table_index, 679 unsigned msg_control, 680 unsigned msg_type, 681 unsigned msg_length, 682 bool header_present, 683 unsigned last_render_target, 684 unsigned response_length, 685 unsigned end_of_thread, 686 unsigned send_commit_msg) 687{ 688 const struct brw_device_info *devinfo = p->devinfo; 689 unsigned sfid; 690 691 if (devinfo->gen >= 7) { 692 /* Use the Render Cache for RT writes; otherwise use the Data Cache */ 693 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE) 694 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; 695 else 696 sfid = GEN7_SFID_DATAPORT_DATA_CACHE; 697 } else if (devinfo->gen == 6) { 698 /* Use the render cache for all write messages. 
*/ 699 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; 700 } else { 701 sfid = BRW_SFID_DATAPORT_WRITE; 702 } 703 704 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length, 705 header_present, end_of_thread); 706 707 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index); 708 brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type); 709 brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control); 710 brw_inst_set_rt_last(devinfo, insn, last_render_target); 711 if (devinfo->gen < 7) { 712 brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg); 713 } 714} 715 716void 717brw_set_dp_read_message(struct brw_codegen *p, 718 brw_inst *insn, 719 unsigned binding_table_index, 720 unsigned msg_control, 721 unsigned msg_type, 722 unsigned target_cache, 723 unsigned msg_length, 724 bool header_present, 725 unsigned response_length) 726{ 727 const struct brw_device_info *devinfo = p->devinfo; 728 unsigned sfid; 729 730 if (devinfo->gen >= 7) { 731 sfid = GEN7_SFID_DATAPORT_DATA_CACHE; 732 } else if (devinfo->gen == 6) { 733 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE) 734 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE; 735 else 736 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE; 737 } else { 738 sfid = BRW_SFID_DATAPORT_READ; 739 } 740 741 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length, 742 header_present, false); 743 744 brw_inst_set_binding_table_index(devinfo, insn, binding_table_index); 745 brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type); 746 brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control); 747 if (devinfo->gen < 6) 748 brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache); 749} 750 751void 752brw_set_sampler_message(struct brw_codegen *p, 753 brw_inst *inst, 754 unsigned binding_table_index, 755 unsigned sampler, 756 unsigned msg_type, 757 unsigned response_length, 758 unsigned msg_length, 759 unsigned header_present, 760 unsigned simd_mode, 761 unsigned return_format) 762{ 763 const struct 
brw_device_info *devinfo = p->devinfo; 764 765 brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length, 766 response_length, header_present, false); 767 768 brw_inst_set_binding_table_index(devinfo, inst, binding_table_index); 769 brw_inst_set_sampler(devinfo, inst, sampler); 770 brw_inst_set_sampler_msg_type(devinfo, inst, msg_type); 771 if (devinfo->gen >= 5) { 772 brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode); 773 } else if (devinfo->gen == 4 && !devinfo->is_g4x) { 774 brw_inst_set_sampler_return_format(devinfo, inst, return_format); 775 } 776} 777 778static void 779gen7_set_dp_scratch_message(struct brw_codegen *p, 780 brw_inst *inst, 781 bool write, 782 bool dword, 783 bool invalidate_after_read, 784 unsigned num_regs, 785 unsigned addr_offset, 786 unsigned mlen, 787 unsigned rlen, 788 bool header_present) 789{ 790 const struct brw_device_info *devinfo = p->devinfo; 791 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 || 792 (devinfo->gen >= 8 && num_regs == 8)); 793 brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE, 794 mlen, rlen, header_present, false); 795 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */ 796 brw_inst_set_scratch_read_write(devinfo, inst, write); 797 brw_inst_set_scratch_type(devinfo, inst, dword); 798 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read); 799 brw_inst_set_scratch_block_size(devinfo, inst, ffs(num_regs) - 1); 800 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset); 801} 802 803#define next_insn brw_next_insn 804brw_inst * 805brw_next_insn(struct brw_codegen *p, unsigned opcode) 806{ 807 const struct brw_device_info *devinfo = p->devinfo; 808 brw_inst *insn; 809 810 if (p->nr_insn + 1 > p->store_size) { 811 p->store_size <<= 1; 812 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size); 813 } 814 815 p->next_insn_offset += 16; 816 insn = &p->store[p->nr_insn++]; 817 memcpy(insn, p->current, sizeof(*insn)); 
/**
 * Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...).
 *
 * 3-src instructions use a compressed encoding: only GRF (and, for the
 * destination, MRF) files, direct addressing, and a restricted set of
 * types (F/D/UD) are representable, which the asserts below enforce.
 * Align16 access mode is required.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   /* On gen7+ MRFs are gone; rewrite an MRF dest to its GRF alias. */
   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      /* Gen6 still has a dst file bit in the 3-src encoding: GRF vs MRF. */
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   /* The 3-src dst subreg field is in 16-byte units. */
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.dw1.bits.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.dw1.bits.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   /* rep_ctrl replicates a single value; signalled here by vstride 0. */
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.dw1.bits.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.dw1.bits.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}
src2); \ 985} 986 987/* Rounding operations (other than RNDD) require two instructions - the first 988 * stores a rounded value (possibly the wrong way) in the dest register, but 989 * also sets a per-channel "increment bit" in the flag register. A predicated 990 * add of 1.0 fixes dest to contain the desired result. 991 * 992 * Sandybridge and later appear to round correctly without an ADD. 993 */ 994#define ROUND(OP) \ 995void brw_##OP(struct brw_codegen *p, \ 996 struct brw_reg dest, \ 997 struct brw_reg src) \ 998{ \ 999 const struct brw_device_info *devinfo = p->devinfo; \ 1000 brw_inst *rnd, *add; \ 1001 rnd = next_insn(p, BRW_OPCODE_##OP); \ 1002 brw_set_dest(p, rnd, dest); \ 1003 brw_set_src0(p, rnd, src); \ 1004 \ 1005 if (devinfo->gen < 6) { \ 1006 /* turn on round-increments */ \ 1007 brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \ 1008 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \ 1009 brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \ 1010 } \ 1011} 1012 1013 1014ALU1(MOV) 1015ALU2(SEL) 1016ALU1(NOT) 1017ALU2(AND) 1018ALU2(OR) 1019ALU2(XOR) 1020ALU2(SHR) 1021ALU2(SHL) 1022ALU2(ASR) 1023ALU1(FRC) 1024ALU1(RNDD) 1025ALU2(MAC) 1026ALU2(MACH) 1027ALU1(LZD) 1028ALU2(DP4) 1029ALU2(DPH) 1030ALU2(DP3) 1031ALU2(DP2) 1032ALU3F(MAD) 1033ALU3F(LRP) 1034ALU1(BFREV) 1035ALU3(BFE) 1036ALU2(BFI1) 1037ALU3(BFI2) 1038ALU1(FBH) 1039ALU1(FBL) 1040ALU1(CBIT) 1041ALU2(ADDC) 1042ALU2(SUBB) 1043 1044ROUND(RNDZ) 1045ROUND(RNDE) 1046 1047 1048brw_inst * 1049brw_ADD(struct brw_codegen *p, struct brw_reg dest, 1050 struct brw_reg src0, struct brw_reg src1) 1051{ 1052 /* 6.2.2: add */ 1053 if (src0.type == BRW_REGISTER_TYPE_F || 1054 (src0.file == BRW_IMMEDIATE_VALUE && 1055 src0.type == BRW_REGISTER_TYPE_VF)) { 1056 assert(src1.type != BRW_REGISTER_TYPE_UD); 1057 assert(src1.type != BRW_REGISTER_TYPE_D); 1058 } 1059 1060 if (src1.type == BRW_REGISTER_TYPE_F || 1061 (src1.file == BRW_IMMEDIATE_VALUE && 1062 src1.type == BRW_REGISTER_TYPE_VF)) { 1063 
assert(src0.type != BRW_REGISTER_TYPE_UD); 1064 assert(src0.type != BRW_REGISTER_TYPE_D); 1065 } 1066 1067 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1); 1068} 1069 1070brw_inst * 1071brw_AVG(struct brw_codegen *p, struct brw_reg dest, 1072 struct brw_reg src0, struct brw_reg src1) 1073{ 1074 assert(dest.type == src0.type); 1075 assert(src0.type == src1.type); 1076 switch (src0.type) { 1077 case BRW_REGISTER_TYPE_B: 1078 case BRW_REGISTER_TYPE_UB: 1079 case BRW_REGISTER_TYPE_W: 1080 case BRW_REGISTER_TYPE_UW: 1081 case BRW_REGISTER_TYPE_D: 1082 case BRW_REGISTER_TYPE_UD: 1083 break; 1084 default: 1085 unreachable("Bad type for brw_AVG"); 1086 } 1087 1088 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1); 1089} 1090 1091brw_inst * 1092brw_MUL(struct brw_codegen *p, struct brw_reg dest, 1093 struct brw_reg src0, struct brw_reg src1) 1094{ 1095 /* 6.32.38: mul */ 1096 if (src0.type == BRW_REGISTER_TYPE_D || 1097 src0.type == BRW_REGISTER_TYPE_UD || 1098 src1.type == BRW_REGISTER_TYPE_D || 1099 src1.type == BRW_REGISTER_TYPE_UD) { 1100 assert(dest.type != BRW_REGISTER_TYPE_F); 1101 } 1102 1103 if (src0.type == BRW_REGISTER_TYPE_F || 1104 (src0.file == BRW_IMMEDIATE_VALUE && 1105 src0.type == BRW_REGISTER_TYPE_VF)) { 1106 assert(src1.type != BRW_REGISTER_TYPE_UD); 1107 assert(src1.type != BRW_REGISTER_TYPE_D); 1108 } 1109 1110 if (src1.type == BRW_REGISTER_TYPE_F || 1111 (src1.file == BRW_IMMEDIATE_VALUE && 1112 src1.type == BRW_REGISTER_TYPE_VF)) { 1113 assert(src0.type != BRW_REGISTER_TYPE_UD); 1114 assert(src0.type != BRW_REGISTER_TYPE_D); 1115 } 1116 1117 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE || 1118 src0.nr != BRW_ARF_ACCUMULATOR); 1119 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE || 1120 src1.nr != BRW_ARF_ACCUMULATOR); 1121 1122 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1); 1123} 1124 1125brw_inst * 1126brw_LINE(struct brw_codegen *p, struct brw_reg dest, 1127 struct brw_reg src0, struct brw_reg src1) 1128{ 1129 src0.vstride = 
/**
 * Emit a float-to-half conversion: F32TO16 on gen7, a converting MOV to an
 * HF-typed destination on gen8+.  When the caller hands us a UD destination
 * that the hardware cannot fill directly, the result is written as the low
 * word of each dword and a second MOV zeroes the high word.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View the UD dst as words with stride 2 so the conversion lands in
       * the low word of each dword; the high word is zeroed below.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Suppress dependency checks between the convert and the fill: they
       * write disjoint halves of the same dwords.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1209 */ 1210 if (src.type == BRW_REGISTER_TYPE_UD) 1211 src = spread(retype(src, BRW_REGISTER_TYPE_W), 2); 1212 1213 assert(src.type == BRW_REGISTER_TYPE_W || 1214 src.type == BRW_REGISTER_TYPE_UW || 1215 src.type == BRW_REGISTER_TYPE_HF); 1216 } 1217 1218 if (devinfo->gen >= 8) { 1219 return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF)); 1220 } else { 1221 assert(devinfo->gen == 7); 1222 return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src); 1223 } 1224} 1225 1226 1227void brw_NOP(struct brw_codegen *p) 1228{ 1229 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP); 1230 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); 1231 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); 1232 brw_set_src1(p, insn, brw_imm_ud(0x0)); 1233} 1234 1235 1236 1237 1238 1239/*********************************************************************** 1240 * Comparisons, if/else/endif 1241 */ 1242 1243brw_inst * 1244brw_JMPI(struct brw_codegen *p, struct brw_reg index, 1245 unsigned predicate_control) 1246{ 1247 const struct brw_device_info *devinfo = p->devinfo; 1248 struct brw_reg ip = brw_ip_reg(); 1249 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index); 1250 1251 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2); 1252 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE); 1253 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); 1254 brw_inst_set_pred_control(devinfo, inst, predicate_control); 1255 1256 return inst; 1257} 1258 1259static void 1260push_if_stack(struct brw_codegen *p, brw_inst *inst) 1261{ 1262 p->if_stack[p->if_stack_depth] = inst - p->store; 1263 1264 p->if_stack_depth++; 1265 if (p->if_stack_array_size <= p->if_stack_depth) { 1266 p->if_stack_array_size *= 2; 1267 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int, 1268 p->if_stack_array_size); 1269 } 1270} 1271 1272static brw_inst * 1273pop_if_stack(struct brw_codegen *p) 1274{ 1275 p->if_stack_depth--; 1276 return 
&p->store[p->if_stack[p->if_stack_depth]]; 1277} 1278 1279static void 1280push_loop_stack(struct brw_codegen *p, brw_inst *inst) 1281{ 1282 if (p->loop_stack_array_size < p->loop_stack_depth) { 1283 p->loop_stack_array_size *= 2; 1284 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int, 1285 p->loop_stack_array_size); 1286 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int, 1287 p->loop_stack_array_size); 1288 } 1289 1290 p->loop_stack[p->loop_stack_depth] = inst - p->store; 1291 p->loop_stack_depth++; 1292 p->if_depth_in_loop[p->loop_stack_depth] = 0; 1293} 1294 1295static brw_inst * 1296get_inner_do_insn(struct brw_codegen *p) 1297{ 1298 return &p->store[p->loop_stack[p->loop_stack_depth - 1]]; 1299} 1300 1301/* EU takes the value from the flag register and pushes it onto some 1302 * sort of a stack (presumably merging with any flag value already on 1303 * the stack). Within an if block, the flags at the top of the stack 1304 * control execution on each channel of the unit, eg. on each of the 1305 * 16 pixel values in our wm programs. 1306 * 1307 * When the matching 'else' instruction is reached (presumably by 1308 * countdown of the instruction count patched in by our ELSE/ENDIF 1309 * functions), the relevant flags are inverted. 1310 * 1311 * When the matching 'endif' instruction is reached, the flags are 1312 * popped off. If the stack is now empty, normal execution resumes. 
/**
 * Emit an IF instruction with the given execution size.
 *
 * The jump fields (gen4 jump count, gen6 jump count, or gen7+ JIP/UIP) are
 * left as zero here and filled in later by brw_ENDIF via patch_IF_ELSE.
 * The new instruction is pushed onto the if-stack so the matching
 * ELSE/ENDIF can find it.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Gen4/5: IF operates on IP with an immediate jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: dedicated JIP/UIP fields; operands are null. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP with an immediate src0. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control needs a thread switch unless we're in SPF mode. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1360 */ 1361brw_inst * 1362gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional, 1363 struct brw_reg src0, struct brw_reg src1) 1364{ 1365 const struct brw_device_info *devinfo = p->devinfo; 1366 brw_inst *insn; 1367 1368 insn = next_insn(p, BRW_OPCODE_IF); 1369 1370 brw_set_dest(p, insn, brw_imm_w(0)); 1371 brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16 1372 : BRW_EXECUTE_8); 1373 brw_inst_set_gen6_jump_count(devinfo, insn, 0); 1374 brw_set_src0(p, insn, src0); 1375 brw_set_src1(p, insn, src1); 1376 1377 assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE); 1378 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE); 1379 brw_inst_set_cond_modifier(devinfo, insn, conditional); 1380 1381 push_if_stack(p, insn); 1382 return insn; 1383} 1384 1385/** 1386 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs. 1387 */ 1388static void 1389convert_IF_ELSE_to_ADD(struct brw_codegen *p, 1390 brw_inst *if_inst, brw_inst *else_inst) 1391{ 1392 const struct brw_device_info *devinfo = p->devinfo; 1393 1394 /* The next instruction (where the ENDIF would be, if it existed) */ 1395 brw_inst *next_inst = &p->store[p->nr_insn]; 1396 1397 assert(p->single_program_flow); 1398 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF); 1399 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE); 1400 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1); 1401 1402 /* Convert IF to an ADD instruction that moves the instruction pointer 1403 * to the first instruction of the ELSE block. If there is no ELSE 1404 * block, point to where ENDIF would be. Reverse the predicate. 1405 * 1406 * There's no need to execute an ENDIF since we don't need to do any 1407 * stack operations, and if we're currently executing, we just want to 1408 * continue normally. 
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * \param if_inst    the IF being resolved (required)
 * \param else_inst  the matching ELSE, or NULL if there wasn't one
 * \param endif_inst the just-emitted ENDIF (required)
 *
 * All jump fields are instruction-count deltas scaled by brw_jump_scale(),
 * which converts them into the units the hardware's jump fields expect.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   /* ENDIF (and ELSE, below) inherit the IF's execution size. */
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: with no ELSE, both JIP and UIP point to the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
/**
 * Close the innermost IF/ELSE block: pop the IF (and optional ELSE) off the
 * if-stack, emit an ENDIF (except in pre-gen6 SPF mode, where the IF/ELSE
 * are rewritten as predicated ADDs instead), and patch all jump targets.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-gen operand encoding for ENDIF; mirrors brw_IF above. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
BRW_EXECUTE_16 1685 : BRW_EXECUTE_8); 1686 return insn; 1687} 1688 1689brw_inst * 1690gen6_HALT(struct brw_codegen *p) 1691{ 1692 const struct brw_device_info *devinfo = p->devinfo; 1693 brw_inst *insn; 1694 1695 insn = next_insn(p, BRW_OPCODE_HALT); 1696 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1697 if (devinfo->gen >= 8) { 1698 brw_set_src0(p, insn, brw_imm_d(0x0)); 1699 } else { 1700 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 1701 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */ 1702 } 1703 1704 if (p->compressed) { 1705 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_16); 1706 } else { 1707 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); 1708 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_8); 1709 } 1710 return insn; 1711} 1712 1713/* DO/WHILE loop: 1714 * 1715 * The DO/WHILE is just an unterminated loop -- break or continue are 1716 * used for control within the loop. We have a few ways they can be 1717 * done. 1718 * 1719 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip, 1720 * jip and no DO instruction. 1721 * 1722 * For non-uniform control flow pre-gen6, there's a DO instruction to 1723 * push the mask, and a WHILE to jump back, and BREAK to get out and 1724 * pop the mask. 1725 * 1726 * For gen6, there's no more mask stack, so no need for DO. WHILE 1727 * just points back to the first instruction of the loop. 
1728 */ 1729brw_inst * 1730brw_DO(struct brw_codegen *p, unsigned execute_size) 1731{ 1732 const struct brw_device_info *devinfo = p->devinfo; 1733 1734 if (devinfo->gen >= 6 || p->single_program_flow) { 1735 push_loop_stack(p, &p->store[p->nr_insn]); 1736 return &p->store[p->nr_insn]; 1737 } else { 1738 brw_inst *insn = next_insn(p, BRW_OPCODE_DO); 1739 1740 push_loop_stack(p, insn); 1741 1742 /* Override the defaults for this instruction: 1743 */ 1744 brw_set_dest(p, insn, brw_null_reg()); 1745 brw_set_src0(p, insn, brw_null_reg()); 1746 brw_set_src1(p, insn, brw_null_reg()); 1747 1748 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE); 1749 brw_inst_set_exec_size(devinfo, insn, execute_size); 1750 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); 1751 1752 return insn; 1753 } 1754} 1755 1756/** 1757 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE 1758 * instruction here. 1759 * 1760 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop 1761 * nesting, since it can always just point to the end of the block/current loop. 1762 */ 1763static void 1764brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst) 1765{ 1766 const struct brw_device_info *devinfo = p->devinfo; 1767 brw_inst *do_inst = get_inner_do_insn(p); 1768 brw_inst *inst; 1769 unsigned br = brw_jump_scale(devinfo); 1770 1771 assert(devinfo->gen < 6); 1772 1773 for (inst = while_inst - 1; inst != do_inst; inst--) { 1774 /* If the jump count is != 0, that means that this instruction has already 1775 * been patched because it's part of a loop inside of the one we're 1776 * patching. 
/**
 * Close the innermost DO/WHILE loop.
 *
 * Gen6+ emits a WHILE whose jump field points back at the loop start.
 * Pre-gen6 either emits an IP-relative ADD (single-program-flow mode) or a
 * real WHILE paired with the DO, and then back-patches any BREAK/CONT
 * instructions inside the loop body.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Per-gen operand encoding; the jump delta is negative (back edge). */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                          : BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
         /* SPF: a plain ADD on IP jumps back to the loop start (the delta is
          * in bytes: 16 per instruction).
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Point any BREAK/CONT inside the loop at this WHILE. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
 */
/**
 * Emit a CMP with the given conditional modifier, writing the comparison
 * result to \p dest and updating the flag register.
 */
void brw_CMP(struct brw_codegen *p,
             struct brw_reg dest,
             unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}

/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 *
 * Pre-Gen6 only: math is implemented as a SEND to the extended math
 * shared function, with the operand staged through MRF \p msg_reg_nr.
 */
void gen4_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned precision)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   unsigned data_type;
   if (has_scalar_region(src)) {
      data_type = BRW_MATH_DATA_SCALAR;
   } else {
      data_type = BRW_MATH_DATA_VECTOR;
   }

   assert(devinfo->gen < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   brw_inst_set_pred_control(devinfo, insn, 0);
   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        src.type == BRW_REGISTER_TYPE_D, /* integer operation? */
                        precision,
                        data_type);
}

/**
 * Gen6+ extended math: a true MATH opcode rather than a message send.
 *
 * The asserts encode the per-generation operand restrictions: allowed
 * register files for dest/src0/src1, unit strides on Gen6, and the type
 * and file rules for the integer-divide and POW cases.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer divide takes non-float sources. */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
      if (function == BRW_MATH_FUNCTION_POW) {
         assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
                (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
      } else {
         /* Unary functions take a null src1. */
         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
                src1.nr == BRW_ARF_NULL);
      }
   }

   /* Source modifiers are ignored for extended math instructions on Gen6.
    */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}


/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_control, msg_type;
   int mlen;

   /* On Gen6+ the message offset is expressed in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* mlen = 1 header register + payload registers.  NOTE(review): only
    * num_regs of 1 or 2 are representable here (2 vs. 4 owords) — confirm
    * callers never pass more.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      /* SEND may not be compressed; force it off on this instruction. */
      if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_NONE) {
         brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
         src_header = vec16(src_header);
      }
      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_control;
   int rlen;

   /* On Gen6+ the message offset is expressed in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* rlen = registers returned; 2 owords fit in one register. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header (g0 copy with the offset in element 2). */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

      brw_set_dest(p, insn, dest);	/* UW?
       */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
                              insn,
                              255, /* binding table index (255=stateless) */
                              msg_control,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
                              true, /* header_present */
                              rlen);
   }
}

/**
 * Gen7+ scratch read using the dedicated scratch data-port message.
 *
 * Reads \p num_regs whole registers from scratch space at \p offset
 * (bytes; converted below to a HWORD index) into \p dest.  Uses g0 as
 * the mandatory message header.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}

/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Copy g0 into the message register to serve as the header. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!)
        */

   brw_pop_insn_state(p);
}


/**
 * Emit a framebuffer-write message (SENDC on Gen6+, SEND before that).
 *
 * On Gen6+ the color payload is submitted headerless; pre-Gen6 the
 * payload lives in MRF space and \p implied_header is used as src0.
 * The destination is the null register (sized per \p dispatch_width).
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  int dispatch_width,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            msg_length,
                            header_present,
                            last_render_target,
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}


/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr is unsigned; -1 (i.e. UINT_MAX) is the "no implied move"
    * sentinel.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_2NDHALF)
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}

/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct brw_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.dw1.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Keep the group-of-16 bits of the index (& 0xf0) and scale to a
       * sampler-state byte offset ((index & 0xf0) << 4 = group * 256).
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}

/**
 * Emit a SEND whose message descriptor may come from a register.
 *
 * If \p desc is an immediate it is encoded directly into the SEND;
 * otherwise it is loaded into address register a0.0 first.  Returns the
 * instruction carrying the descriptor (the SEND itself in the immediate
 * case, the setup instruction otherwise) so the caller can add more
 * descriptor bits to it.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *send, *setup;

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return setup;
}

/**
 * Emit a surface-access SEND with a possibly indirect surface index.
 *
 * A non-immediate \p surface is masked to 8 bits and loaded into a0.0
 * before being used as the message descriptor; message/response lengths
 * and header presence are then set on the returned instruction.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.dw1.bits.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}

/**
 * Scan forward from \p start_offset for the instruction that ends the
 * current control-flow block (ENDIF, ELSE, WHILE, or HALT).  Returns its
 * offset, or 0 if none is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct brw_device_info *devinfo = p->devinfo;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_HALT:
         return offset;
      }
   }

   return 0;
}

/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   /* Convert jump-count units back into bytes of instruction store. */
   int scale = 16 / brw_jump_scale(devinfo);
   void *store = p->store;

   assert(devinfo->gen >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
         int jip = devinfo->gen == 6 ?
                   brw_inst_gen6_jump_count(devinfo, insn)
                 : brw_inst_jip(devinfo, insn);
         /* A WHILE jumping back to (or before) start_offset closes the
          * loop containing start_offset.
          */
         if (offset + jip * scale <= start_offset)
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-Gen6 control flow is patched via gen4 jump counts instead. */
   if (devinfo->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_cmpt_control(devinfo, insn)) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end jumps to the next
          * instruction (a count of one).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}

/**
 * Emit an FF_SYNC SEND; the message payload and descriptor are filled in
 * by brw_set_ff_sync_message().  Pre-Gen6 the source is staged through
 * MRF \p msg_reg_nr via the implied-move helper.
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}

/**
 * Number of payload/response registers for a surface-access message:
 * one register in Align16 when SIMD4x2 is available, twice the channel
 * count for compressed (SIMD16) execution, otherwise one per channel.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         bool has_simd4x2,
                         bool has_simd16)
{
   if (has_simd4x2 && brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
      return 1;
   else if (has_simd16 && p->compressed)
      return 2 * num_channels;
   else
      return num_channels;
}

/**
 * Fill in the descriptor bits of an untyped atomic SEND: operation type,
 * SIMD mode, and whether return data is expected.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
                                  brw_inst *insn,
                                  unsigned atomic_op,
                                  bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ?
       1 << 5 : 0); /* Return data expected */

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (!p->compressed)
            msg_control |= 1 << 4; /* SIMD8 mode */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
      }
   } else {
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);

      if (!p->compressed)
         msg_control |= 1 << 4; /* SIMD8 mode */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}

/**
 * Emit an untyped atomic operation \p atomic_op on \p surface.
 * The response length is nonzero only when \p response_expected.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* An atomic returns at most a single channel, so response_expected
    * (0 or 1) doubles as the channel count here.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}

/**
 * Fill in the descriptor for an untyped surface read: channel-disable
 * mask plus SIMD mode.
 */
static void
brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
                                        struct brw_inst *insn,
                                        unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Set mask of 32-bit channels to drop. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      if (p->compressed)
         msg_control |= 1 << 4; /* SIMD16 mode */
      else
         msg_control |= 2 << 4; /* SIMD8 mode */
   }

   brw_inst_set_dp_msg_type(devinfo, insn,
                            (devinfo->gen >= 8 || devinfo->is_haswell ?
                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}

/**
 * Emit an untyped surface read of \p num_channels 32-bit channels
 * from \p surface into \p dst.
 */
void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2904 HSW_SFID_DATAPORT_DATA_CACHE_1 : 2905 GEN7_SFID_DATAPORT_DATA_CACHE); 2906 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1); 2907 struct brw_inst *insn = brw_send_indirect_surface_message( 2908 p, sfid, dst, payload, surface, msg_length, 2909 brw_surface_payload_size(p, num_channels, true, true), 2910 align1); 2911 2912 brw_set_dp_untyped_surface_read_message( 2913 p, insn, num_channels); 2914} 2915 2916static void 2917brw_set_dp_untyped_surface_write_message(struct brw_codegen *p, 2918 struct brw_inst *insn, 2919 unsigned num_channels) 2920{ 2921 const struct brw_device_info *devinfo = p->devinfo; 2922 /* Set mask of 32-bit channels to drop. */ 2923 unsigned msg_control = 0xf & (0xf << num_channels); 2924 2925 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { 2926 if (p->compressed) 2927 msg_control |= 1 << 4; /* SIMD16 mode */ 2928 else 2929 msg_control |= 2 << 4; /* SIMD8 mode */ 2930 } else { 2931 if (devinfo->gen >= 8 || devinfo->is_haswell) 2932 msg_control |= 0 << 4; /* SIMD4x2 mode */ 2933 else 2934 msg_control |= 2 << 4; /* SIMD8 mode */ 2935 } 2936 2937 brw_inst_set_dp_msg_type(devinfo, insn, 2938 devinfo->gen >= 8 || devinfo->is_haswell ? 2939 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE : 2940 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE); 2941 brw_inst_set_dp_msg_control(devinfo, insn, msg_control); 2942} 2943 2944void 2945brw_untyped_surface_write(struct brw_codegen *p, 2946 struct brw_reg payload, 2947 struct brw_reg surface, 2948 unsigned msg_length, 2949 unsigned num_channels) 2950{ 2951 const struct brw_device_info *devinfo = p->devinfo; 2952 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? 2953 HSW_SFID_DATAPORT_DATA_CACHE_1 : 2954 GEN7_SFID_DATAPORT_DATA_CACHE); 2955 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1; 2956 /* Mask out unused components -- See comment in brw_untyped_atomic(). 
*/ 2957 const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ? 2958 WRITEMASK_X : WRITEMASK_XYZW; 2959 struct brw_inst *insn = brw_send_indirect_surface_message( 2960 p, sfid, brw_writemask(brw_null_reg(), mask), 2961 payload, surface, msg_length, 0, align1); 2962 2963 brw_set_dp_untyped_surface_write_message( 2964 p, insn, num_channels); 2965} 2966 2967static void 2968brw_set_dp_typed_atomic_message(struct brw_codegen *p, 2969 struct brw_inst *insn, 2970 unsigned atomic_op, 2971 bool response_expected) 2972{ 2973 const struct brw_device_info *devinfo = p->devinfo; 2974 unsigned msg_control = 2975 atomic_op | /* Atomic Operation Type: BRW_AOP_* */ 2976 (response_expected ? 1 << 5 : 0); /* Return data expected */ 2977 2978 if (devinfo->gen >= 8 || devinfo->is_haswell) { 2979 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { 2980 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q) 2981 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */ 2982 2983 brw_inst_set_dp_msg_type(devinfo, insn, 2984 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP); 2985 } else { 2986 brw_inst_set_dp_msg_type(devinfo, insn, 2987 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2); 2988 } 2989 2990 } else { 2991 brw_inst_set_dp_msg_type(devinfo, insn, 2992 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP); 2993 2994 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q) 2995 msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */ 2996 } 2997 2998 brw_inst_set_dp_msg_control(devinfo, insn, msg_control); 2999} 3000 3001void 3002brw_typed_atomic(struct brw_codegen *p, 3003 struct brw_reg dst, 3004 struct brw_reg payload, 3005 struct brw_reg surface, 3006 unsigned atomic_op, 3007 unsigned msg_length, 3008 bool response_expected) { 3009 const struct brw_device_info *devinfo = p->devinfo; 3010 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? 
3011 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3012 GEN6_SFID_DATAPORT_RENDER_CACHE); 3013 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1); 3014 /* Mask out unused components -- See comment in brw_untyped_atomic(). */ 3015 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X; 3016 struct brw_inst *insn = brw_send_indirect_surface_message( 3017 p, sfid, brw_writemask(dst, mask), payload, surface, msg_length, 3018 brw_surface_payload_size(p, response_expected, 3019 devinfo->gen >= 8 || devinfo->is_haswell, false), 3020 true); 3021 3022 brw_set_dp_typed_atomic_message( 3023 p, insn, atomic_op, response_expected); 3024} 3025 3026static void 3027brw_set_dp_typed_surface_read_message(struct brw_codegen *p, 3028 struct brw_inst *insn, 3029 unsigned num_channels) 3030{ 3031 const struct brw_device_info *devinfo = p->devinfo; 3032 /* Set mask of unused channels. */ 3033 unsigned msg_control = 0xf & (0xf << num_channels); 3034 3035 if (devinfo->gen >= 8 || devinfo->is_haswell) { 3036 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { 3037 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q) 3038 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */ 3039 else 3040 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */ 3041 } 3042 3043 brw_inst_set_dp_msg_type(devinfo, insn, 3044 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ); 3045 } else { 3046 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { 3047 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q) 3048 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */ 3049 } 3050 3051 brw_inst_set_dp_msg_type(devinfo, insn, 3052 GEN7_DATAPORT_RC_TYPED_SURFACE_READ); 3053 } 3054 3055 brw_inst_set_dp_msg_control(devinfo, insn, msg_control); 3056} 3057 3058void 3059brw_typed_surface_read(struct brw_codegen *p, 3060 struct brw_reg dst, 3061 struct brw_reg payload, 3062 struct brw_reg surface, 3063 unsigned msg_length, 3064 
unsigned num_channels) 3065{ 3066 const struct brw_device_info *devinfo = p->devinfo; 3067 const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? 3068 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3069 GEN6_SFID_DATAPORT_RENDER_CACHE); 3070 struct brw_inst *insn = brw_send_indirect_surface_message( 3071 p, sfid, dst, payload, surface, msg_length, 3072 brw_surface_payload_size(p, num_channels, 3073 devinfo->gen >= 8 || devinfo->is_haswell, false), 3074 true); 3075 3076 brw_set_dp_typed_surface_read_message( 3077 p, insn, num_channels); 3078} 3079 3080static void 3081brw_set_dp_typed_surface_write_message(struct brw_codegen *p, 3082 struct brw_inst *insn, 3083 unsigned num_channels) 3084{ 3085 const struct brw_device_info *devinfo = p->devinfo; 3086 /* Set mask of unused channels. */ 3087 unsigned msg_control = 0xf & (0xf << num_channels); 3088 3089 if (devinfo->gen >= 8 || devinfo->is_haswell) { 3090 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { 3091 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q) 3092 msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */ 3093 else 3094 msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */ 3095 } 3096 3097 brw_inst_set_dp_msg_type(devinfo, insn, 3098 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE); 3099 3100 } else { 3101 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { 3102 if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q) 3103 msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */ 3104 } 3105 3106 brw_inst_set_dp_msg_type(devinfo, insn, 3107 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE); 3108 } 3109 3110 brw_inst_set_dp_msg_control(devinfo, insn, msg_control); 3111} 3112 3113void 3114brw_typed_surface_write(struct brw_codegen *p, 3115 struct brw_reg payload, 3116 struct brw_reg surface, 3117 unsigned msg_length, 3118 unsigned num_channels) 3119{ 3120 const struct brw_device_info *devinfo = p->devinfo; 3121 const unsigned sfid = (devinfo->gen 
>= 8 || devinfo->is_haswell ? 3122 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3123 GEN6_SFID_DATAPORT_RENDER_CACHE); 3124 const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1); 3125 /* Mask out unused components -- See comment in brw_untyped_atomic(). */ 3126 const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ? 3127 WRITEMASK_X : WRITEMASK_XYZW); 3128 struct brw_inst *insn = brw_send_indirect_surface_message( 3129 p, sfid, brw_writemask(brw_null_reg(), mask), 3130 payload, surface, msg_length, 0, true); 3131 3132 brw_set_dp_typed_surface_write_message( 3133 p, insn, num_channels); 3134} 3135 3136static void 3137brw_set_memory_fence_message(struct brw_codegen *p, 3138 struct brw_inst *insn, 3139 enum brw_message_target sfid, 3140 bool commit_enable) 3141{ 3142 const struct brw_device_info *devinfo = p->devinfo; 3143 3144 brw_set_message_descriptor(p, insn, sfid, 3145 1 /* message length */, 3146 (commit_enable ? 1 : 0) /* response length */, 3147 true /* header present */, 3148 false); 3149 3150 switch (sfid) { 3151 case GEN6_SFID_DATAPORT_RENDER_CACHE: 3152 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE); 3153 break; 3154 case GEN7_SFID_DATAPORT_DATA_CACHE: 3155 brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE); 3156 break; 3157 default: 3158 unreachable("Not reached"); 3159 } 3160 3161 if (commit_enable) 3162 brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5); 3163} 3164 3165void 3166brw_memory_fence(struct brw_codegen *p, 3167 struct brw_reg dst) 3168{ 3169 const struct brw_device_info *devinfo = p->devinfo; 3170 const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell; 3171 struct brw_inst *insn; 3172 3173 /* Set dst as destination for dependency tracking, the MEMORY_FENCE 3174 * message doesn't write anything back. 
3175 */ 3176 insn = next_insn(p, BRW_OPCODE_SEND); 3177 brw_set_dest(p, insn, dst); 3178 brw_set_src0(p, insn, dst); 3179 brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE, 3180 commit_enable); 3181 3182 if (devinfo->gen == 7 && !devinfo->is_haswell) { 3183 /* IVB does typed surface access through the render cache, so we need to 3184 * flush it too. Use a different register so both flushes can be 3185 * pipelined by the hardware. 3186 */ 3187 insn = next_insn(p, BRW_OPCODE_SEND); 3188 brw_set_dest(p, insn, offset(dst, 1)); 3189 brw_set_src0(p, insn, offset(dst, 1)); 3190 brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE, 3191 commit_enable); 3192 3193 /* Now write the response of the second message into the response of the 3194 * first to trigger a pipeline stall -- This way future render and data 3195 * cache messages will be properly ordered with respect to past data and 3196 * render cache messages. 3197 */ 3198 brw_push_insn_state(p); 3199 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 3200 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3201 brw_MOV(p, dst, offset(dst, 1)); 3202 brw_pop_insn_state(p); 3203 } 3204} 3205 3206void 3207brw_pixel_interpolator_query(struct brw_codegen *p, 3208 struct brw_reg dest, 3209 struct brw_reg mrf, 3210 bool noperspective, 3211 unsigned mode, 3212 unsigned data, 3213 unsigned msg_length, 3214 unsigned response_length) 3215{ 3216 const struct brw_device_info *devinfo = p->devinfo; 3217 struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND); 3218 3219 brw_set_dest(p, insn, dest); 3220 brw_set_src0(p, insn, mrf); 3221 brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR, 3222 msg_length, response_length, 3223 false /* header is never present for PI */, 3224 false); 3225 3226 brw_inst_set_pi_simd_mode( 3227 devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16); 3228 brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */ 3229 
brw_inst_set_pi_nopersp(devinfo, insn, noperspective); 3230 brw_inst_set_pi_message_type(devinfo, insn, mode); 3231 brw_inst_set_pi_message_data(devinfo, insn, data); 3232} 3233 3234void 3235brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst) 3236{ 3237 const struct brw_device_info *devinfo = p->devinfo; 3238 brw_inst *inst; 3239 3240 assert(devinfo->gen >= 7); 3241 3242 brw_push_insn_state(p); 3243 3244 if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) { 3245 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3246 3247 if (devinfo->gen >= 8) { 3248 /* Getting the first active channel index is easy on Gen8: Just find 3249 * the first bit set in the mask register. The same register exists 3250 * on HSW already but it reads back as all ones when the current 3251 * instruction has execution masking disabled, so it's kind of 3252 * useless. 3253 */ 3254 inst = brw_FBL(p, vec1(dst), 3255 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); 3256 3257 /* Quarter control has the effect of magically shifting the value of 3258 * this register. Make sure it's set to zero. 3259 */ 3260 brw_inst_set_qtr_control(devinfo, inst, GEN6_COMPRESSION_1Q); 3261 } else { 3262 const struct brw_reg flag = retype(brw_flag_reg(1, 0), 3263 BRW_REGISTER_TYPE_UD); 3264 3265 brw_MOV(p, flag, brw_imm_ud(0)); 3266 3267 /* Run a 16-wide instruction returning zero with execution masking 3268 * and a conditional modifier enabled in order to get the current 3269 * execution mask in f1.0. 
3270 */ 3271 inst = brw_MOV(p, brw_null_reg(), brw_imm_ud(0)); 3272 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_16); 3273 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); 3274 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z); 3275 brw_inst_set_flag_reg_nr(devinfo, inst, 1); 3276 3277 brw_FBL(p, vec1(dst), flag); 3278 } 3279 } else { 3280 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3281 3282 if (devinfo->gen >= 8) { 3283 /* In SIMD4x2 mode the first active channel index is just the 3284 * negation of the first bit of the mask register. 3285 */ 3286 inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X), 3287 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)), 3288 brw_imm_ud(1)); 3289 3290 } else { 3291 /* Overwrite the destination without and with execution masking to 3292 * find out which of the channels is active. 3293 */ 3294 brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), 3295 brw_imm_ud(1)); 3296 3297 inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X), 3298 brw_imm_ud(0)); 3299 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE); 3300 } 3301 } 3302 3303 brw_pop_insn_state(p); 3304} 3305 3306void 3307brw_broadcast(struct brw_codegen *p, 3308 struct brw_reg dst, 3309 struct brw_reg src, 3310 struct brw_reg idx) 3311{ 3312 const struct brw_device_info *devinfo = p->devinfo; 3313 const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1; 3314 brw_inst *inst; 3315 3316 assert(src.file == BRW_GENERAL_REGISTER_FILE && 3317 src.address_mode == BRW_ADDRESS_DIRECT); 3318 3319 if ((src.vstride == 0 && (src.hstride == 0 || !align1)) || 3320 idx.file == BRW_IMMEDIATE_VALUE) { 3321 /* Trivial, the source is already uniform or the index is a constant. 3322 * We will typically not get here if the optimizer is doing its job, but 3323 * asserting would be mean. 3324 */ 3325 const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0; 3326 brw_MOV(p, dst, 3327 (align1 ? 
stride(suboffset(src, i), 0, 1, 0) : 3328 stride(suboffset(src, 4 * i), 0, 4, 1))); 3329 } else { 3330 if (align1) { 3331 const struct brw_reg addr = 3332 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD); 3333 const unsigned offset = src.nr * REG_SIZE + src.subnr; 3334 /* Limit in bytes of the signed indirect addressing immediate. */ 3335 const unsigned limit = 512; 3336 3337 brw_push_insn_state(p); 3338 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3339 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 3340 3341 /* Take into account the component size and horizontal stride. */ 3342 assert(src.vstride == src.hstride + src.width); 3343 brw_SHL(p, addr, vec1(idx), 3344 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) + 3345 src.hstride - 1)); 3346 3347 /* We can only address up to limit bytes using the indirect 3348 * addressing immediate, account for the difference if the source 3349 * register is above this limit. 3350 */ 3351 if (offset >= limit) 3352 brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit)); 3353 3354 brw_pop_insn_state(p); 3355 3356 /* Use indirect addressing to fetch the specified component. */ 3357 brw_MOV(p, dst, 3358 retype(brw_vec1_indirect(addr.subnr, offset % limit), 3359 src.type)); 3360 } else { 3361 /* In SIMD4x2 mode the index can be either zero or one, replicate it 3362 * to all bits of a flag register, 3363 */ 3364 inst = brw_MOV(p, 3365 brw_null_reg(), 3366 stride(brw_swizzle1(idx, 0), 0, 4, 1)); 3367 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE); 3368 brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ); 3369 brw_inst_set_flag_reg_nr(devinfo, inst, 1); 3370 3371 /* and use predicated SEL to pick the right channel. 
*/ 3372 inst = brw_SEL(p, dst, 3373 stride(suboffset(src, 4), 0, 4, 1), 3374 stride(src, 0, 4, 1)); 3375 brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL); 3376 brw_inst_set_flag_reg_nr(devinfo, inst, 1); 3377 } 3378 } 3379} 3380 3381/** 3382 * This instruction is generated as a single-channel align1 instruction by 3383 * both the VS and FS stages when using INTEL_DEBUG=shader_time. 3384 * 3385 * We can't use the typed atomic op in the FS because that has the execution 3386 * mask ANDed with the pixel mask, but we just want to write the one dword for 3387 * all the pixels. 3388 * 3389 * We don't use the SIMD4x2 atomic ops in the VS because want to just write 3390 * one u32. So we use the same untyped atomic write message as the pixel 3391 * shader. 3392 * 3393 * The untyped atomic operation requires a BUFFER surface type with RAW 3394 * format, and is only accessible through the legacy DATA_CACHE dataport 3395 * messages. 3396 */ 3397void brw_shader_time_add(struct brw_codegen *p, 3398 struct brw_reg payload, 3399 uint32_t surf_index) 3400{ 3401 const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ? 3402 HSW_SFID_DATAPORT_DATA_CACHE_1 : 3403 GEN7_SFID_DATAPORT_DATA_CACHE); 3404 assert(p->devinfo->gen >= 7); 3405 3406 brw_push_insn_state(p); 3407 brw_set_default_access_mode(p, BRW_ALIGN_1); 3408 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 3409 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 3410 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 3411 3412 /* We use brw_vec1_reg and unmasked because we want to increment the given 3413 * offset only once. 
3414 */ 3415 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, 3416 BRW_ARF_NULL, 0)); 3417 brw_set_src0(p, send, brw_vec1_reg(payload.file, 3418 payload.nr, 0)); 3419 brw_set_src1(p, send, brw_imm_ud(0)); 3420 brw_set_message_descriptor(p, send, sfid, 2, 0, false, false); 3421 brw_inst_set_binding_table_index(p->devinfo, send, surf_index); 3422 brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false); 3423 3424 brw_pop_insn_state(p); 3425} 3426 3427 3428/** 3429 * Emit the SEND message for a barrier 3430 */ 3431void 3432brw_barrier(struct brw_codegen *p, struct brw_reg src) 3433{ 3434 const struct brw_device_info *devinfo = p->devinfo; 3435 struct brw_inst *inst; 3436 3437 assert(devinfo->gen >= 7); 3438 3439 inst = next_insn(p, BRW_OPCODE_SEND); 3440 brw_set_dest(p, inst, brw_null_reg()); 3441 brw_set_src0(p, inst, src); 3442 brw_set_src1(p, inst, brw_null_reg()); 3443 3444 brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY, 3445 1 /* msg_length */, 3446 0 /* response_length */, 3447 false /* header_present */, 3448 false /* end_of_thread */); 3449 3450 brw_inst_set_gateway_notify(devinfo, inst, 1); 3451 brw_inst_set_gateway_subfuncid(devinfo, inst, 3452 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG); 3453 3454 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE); 3455} 3456 3457 3458/** 3459 * Emit the wait instruction for a barrier 3460 */ 3461void 3462brw_WAIT(struct brw_codegen *p) 3463{ 3464 const struct brw_device_info *devinfo = p->devinfo; 3465 struct brw_inst *insn; 3466 3467 struct brw_reg src = brw_notification_reg(); 3468 3469 insn = next_insn(p, BRW_OPCODE_WAIT); 3470 brw_set_dest(p, insn, src); 3471 brw_set_src0(p, insn, src); 3472 brw_set_src1(p, insn, brw_null_reg()); 3473 3474 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1); 3475 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 3476} 3477