1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "compiler/glsl/ir.h" 25#include "brw_fs.h" 26#include "brw_fs_surface_builder.h" 27#include "brw_nir.h" 28#include "brw_program.h" 29 30using namespace brw; 31using namespace brw::surface_access; 32 33void 34fs_visitor::emit_nir_code() 35{ 36 /* emit the arrays used for inputs and outputs - load/store intrinsics will 37 * be converted to reads/writes of these arrays 38 */ 39 nir_setup_outputs(); 40 nir_setup_uniforms(); 41 nir_emit_system_values(); 42 43 /* get the main function and emit it */ 44 nir_foreach_function(function, nir) { 45 assert(strcmp(function->name, "main") == 0); 46 assert(function->impl); 47 nir_emit_impl(function->impl); 48 } 49} 50 51void 52fs_visitor::nir_setup_outputs() 53{ 54 if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT) 55 return; 56 57 nir_foreach_variable(var, &nir->outputs) { 58 const unsigned vec4s = 59 var->data.compact ? 
DIV_ROUND_UP(glsl_get_length(var->type), 4) 60 : type_size_vec4(var->type); 61 fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s); 62 for (unsigned i = 0; i < vec4s; i++) { 63 if (outputs[var->data.driver_location + i].file == BAD_FILE) 64 outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i); 65 } 66 } 67} 68 69void 70fs_visitor::nir_setup_uniforms() 71{ 72 if (dispatch_width != min_dispatch_width) 73 return; 74 75 uniforms = nir->num_uniforms / 4; 76} 77 78static bool 79emit_system_values_block(nir_block *block, fs_visitor *v) 80{ 81 fs_reg *reg; 82 83 nir_foreach_instr(instr, block) { 84 if (instr->type != nir_instr_type_intrinsic) 85 continue; 86 87 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 88 switch (intrin->intrinsic) { 89 case nir_intrinsic_load_vertex_id: 90 unreachable("should be lowered by lower_vertex_id()."); 91 92 case nir_intrinsic_load_vertex_id_zero_base: 93 assert(v->stage == MESA_SHADER_VERTEX); 94 reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE]; 95 if (reg->file == BAD_FILE) 96 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); 97 break; 98 99 case nir_intrinsic_load_base_vertex: 100 assert(v->stage == MESA_SHADER_VERTEX); 101 reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX]; 102 if (reg->file == BAD_FILE) 103 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX); 104 break; 105 106 case nir_intrinsic_load_instance_id: 107 assert(v->stage == MESA_SHADER_VERTEX); 108 reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID]; 109 if (reg->file == BAD_FILE) 110 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID); 111 break; 112 113 case nir_intrinsic_load_base_instance: 114 assert(v->stage == MESA_SHADER_VERTEX); 115 reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE]; 116 if (reg->file == BAD_FILE) 117 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE); 118 break; 119 120 case nir_intrinsic_load_draw_id: 121 assert(v->stage == MESA_SHADER_VERTEX); 122 reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID]; 123 if (reg->file == BAD_FILE) 124 *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID); 125 break; 126 127 case nir_intrinsic_load_invocation_id: 128 if (v->stage == MESA_SHADER_TESS_CTRL) 129 break; 130 assert(v->stage == MESA_SHADER_GEOMETRY); 131 reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 132 if (reg->file == BAD_FILE) { 133 const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL); 134 fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 135 fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 136 abld.SHR(iid, g1, brw_imm_ud(27u)); 137 *reg = iid; 138 } 139 break; 140 141 case nir_intrinsic_load_sample_pos: 142 assert(v->stage == MESA_SHADER_FRAGMENT); 143 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 144 if (reg->file == BAD_FILE) 145 *reg = *v->emit_samplepos_setup(); 146 break; 147 148 case nir_intrinsic_load_sample_id: 149 assert(v->stage == MESA_SHADER_FRAGMENT); 150 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 151 if (reg->file == BAD_FILE) 152 *reg = *v->emit_sampleid_setup(); 153 break; 154 155 case nir_intrinsic_load_sample_mask_in: 156 assert(v->stage == MESA_SHADER_FRAGMENT); 157 assert(v->devinfo->gen >= 7); 158 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN]; 159 if (reg->file == BAD_FILE) 160 *reg = *v->emit_samplemaskin_setup(); 161 break; 162 163 case nir_intrinsic_load_work_group_id: 164 assert(v->stage == MESA_SHADER_COMPUTE); 165 reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID]; 166 if (reg->file == BAD_FILE) 
167 *reg = *v->emit_cs_work_group_id_setup(); 168 break; 169 170 case nir_intrinsic_load_helper_invocation: 171 assert(v->stage == MESA_SHADER_FRAGMENT); 172 reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION]; 173 if (reg->file == BAD_FILE) { 174 const fs_builder abld = 175 v->bld.annotate("gl_HelperInvocation", NULL); 176 177 /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the 178 * pixel mask is in g1.7 of the thread payload. 179 * 180 * We move the per-channel pixel enable bit to the low bit of each 181 * channel by shifting the byte containing the pixel mask by the 182 * vector immediate 0x76543210UV. 183 * 184 * The region of <1,8,0> reads only 1 byte (the pixel masks for 185 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel 186 * masks for 2 and 3) in SIMD16. 187 */ 188 fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1); 189 abld.SHR(shifted, 190 stride(byte_offset(retype(brw_vec1_grf(1, 0), 191 BRW_REGISTER_TYPE_UB), 28), 192 1, 8, 0), 193 brw_imm_v(0x76543210)); 194 195 /* A set bit in the pixel mask means the channel is enabled, but 196 * that is the opposite of gl_HelperInvocation so we need to invert 197 * the mask. 198 * 199 * The negate source-modifier bit of logical instructions on Gen8+ 200 * performs 1's complement negation, so we can use that instead of 201 * a NOT instruction. 202 */ 203 fs_reg inverted = negate(shifted); 204 if (v->devinfo->gen < 8) { 205 inverted = abld.vgrf(BRW_REGISTER_TYPE_UW); 206 abld.NOT(inverted, shifted); 207 } 208 209 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing 210 * with 1 and negating. 211 */ 212 fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 213 abld.AND(anded, inverted, brw_imm_uw(1)); 214 215 fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1); 216 abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D))); 217 *reg = dst; 218 } 219 break; 220 221 default: 222 break; 223 } 224 } 225 226 return true; 227} 228 229void 230fs_visitor::nir_emit_system_values() 231{ 232 nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX); 233 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { 234 nir_system_values[i] = fs_reg(); 235 } 236 237 nir_foreach_function(function, nir) { 238 assert(strcmp(function->name, "main") == 0); 239 assert(function->impl); 240 nir_foreach_block(block, function->impl) { 241 emit_system_values_block(block, this); 242 } 243 } 244} 245 246void 247fs_visitor::nir_emit_impl(nir_function_impl *impl) 248{ 249 nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc); 250 for (unsigned i = 0; i < impl->reg_alloc; i++) { 251 nir_locals[i] = fs_reg(); 252 } 253 254 foreach_list_typed(nir_register, reg, node, &impl->registers) { 255 unsigned array_elems = 256 reg->num_array_elems == 0 ? 1 : reg->num_array_elems; 257 unsigned size = array_elems * reg->num_components; 258 const brw_reg_type reg_type = 259 reg->bit_size == 32 ? 
BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF; 260 nir_locals[reg->index] = bld.vgrf(reg_type, size); 261 } 262 263 nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg, 264 impl->ssa_alloc); 265 266 nir_emit_cf_list(&impl->body); 267} 268 269void 270fs_visitor::nir_emit_cf_list(exec_list *list) 271{ 272 exec_list_validate(list); 273 foreach_list_typed(nir_cf_node, node, node, list) { 274 switch (node->type) { 275 case nir_cf_node_if: 276 nir_emit_if(nir_cf_node_as_if(node)); 277 break; 278 279 case nir_cf_node_loop: 280 nir_emit_loop(nir_cf_node_as_loop(node)); 281 break; 282 283 case nir_cf_node_block: 284 nir_emit_block(nir_cf_node_as_block(node)); 285 break; 286 287 default: 288 unreachable("Invalid CFG node block"); 289 } 290 } 291} 292 293void 294fs_visitor::nir_emit_if(nir_if *if_stmt) 295{ 296 /* first, put the condition into f0 */ 297 fs_inst *inst = bld.MOV(bld.null_reg_d(), 298 retype(get_nir_src(if_stmt->condition), 299 BRW_REGISTER_TYPE_D)); 300 inst->conditional_mod = BRW_CONDITIONAL_NZ; 301 302 bld.IF(BRW_PREDICATE_NORMAL); 303 304 nir_emit_cf_list(&if_stmt->then_list); 305 306 /* note: if the else is empty, dead CF elimination will remove it */ 307 bld.emit(BRW_OPCODE_ELSE); 308 309 nir_emit_cf_list(&if_stmt->else_list); 310 311 bld.emit(BRW_OPCODE_ENDIF); 312} 313 314void 315fs_visitor::nir_emit_loop(nir_loop *loop) 316{ 317 bld.emit(BRW_OPCODE_DO); 318 319 nir_emit_cf_list(&loop->body); 320 321 bld.emit(BRW_OPCODE_WHILE); 322} 323 324void 325fs_visitor::nir_emit_block(nir_block *block) 326{ 327 nir_foreach_instr(instr, block) { 328 nir_emit_instr(instr); 329 } 330} 331 332void 333fs_visitor::nir_emit_instr(nir_instr *instr) 334{ 335 const fs_builder abld = bld.annotate(NULL, instr); 336 337 switch (instr->type) { 338 case nir_instr_type_alu: 339 nir_emit_alu(abld, nir_instr_as_alu(instr)); 340 break; 341 342 case nir_instr_type_intrinsic: 343 switch (stage) { 344 case MESA_SHADER_VERTEX: 345 nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 346 break; 347 case MESA_SHADER_TESS_CTRL: 348 nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 349 break; 350 case MESA_SHADER_TESS_EVAL: 351 nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); 352 break; 353 case MESA_SHADER_GEOMETRY: 354 nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 355 break; 356 case MESA_SHADER_FRAGMENT: 357 nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 358 break; 359 case MESA_SHADER_COMPUTE: 360 nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 361 break; 362 default: 363 unreachable("unsupported shader stage"); 364 } 365 break; 366 367 case nir_instr_type_tex: 368 nir_emit_texture(abld, nir_instr_as_tex(instr)); 369 break; 370 371 case nir_instr_type_load_const: 372 nir_emit_load_const(abld, nir_instr_as_load_const(instr)); 373 break; 374 375 case nir_instr_type_ssa_undef: 376 /* We create a new VGRF for undefs on every use (by handling 377 * them in get_nir_src()), rather than for each definition. 378 * This helps register coalescing eliminate MOVs from undef. 379 */ 380 break; 381 382 case nir_instr_type_jump: 383 nir_emit_jump(abld, nir_instr_as_jump(instr)); 384 break; 385 386 default: 387 unreachable("unknown instruction type"); 388 } 389} 390 391/** 392 * Recognizes a parent instruction of nir_op_extract_* and changes the type to 393 * match instr. 
394 */ 395bool 396fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, 397 const fs_reg &result) 398{ 399 if (!instr->src[0].src.is_ssa || 400 !instr->src[0].src.ssa->parent_instr) 401 return false; 402 403 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) 404 return false; 405 406 nir_alu_instr *src0 = 407 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 408 409 if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 && 410 src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) 411 return false; 412 413 nir_const_value *element = nir_src_as_const_value(src0->src[1].src); 414 assert(element != NULL); 415 416 /* Element type to extract.*/ 417 const brw_reg_type type = brw_int_type( 418 src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1, 419 src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8); 420 421 fs_reg op0 = get_nir_src(src0->src[0].src); 422 op0.type = brw_type_for_nir_type( 423 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] | 424 nir_src_bit_size(src0->src[0].src))); 425 op0 = offset(op0, bld, src0->src[0].swizzle[0]); 426 427 set_saturate(instr->dest.saturate, 428 bld.MOV(result, subscript(op0, type, element->u32[0]))); 429 return true; 430} 431 432bool 433fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, 434 const fs_reg &result) 435{ 436 if (!instr->src[0].src.is_ssa || 437 instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic) 438 return false; 439 440 nir_intrinsic_instr *src0 = 441 nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr); 442 443 if (src0->intrinsic != nir_intrinsic_load_front_face) 444 return false; 445 446 nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src); 447 if (!value1 || fabsf(value1->f32[0]) != 1.0f) 448 return false; 449 450 nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src); 451 if (!value2 || fabsf(value2->f32[0]) != 1.0f) 452 return false; 453 454 fs_reg tmp = vgrf(glsl_type::int_type); 455 456 if (devinfo->gen >= 6) { 457 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */ 458 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); 459 460 /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 461 * 462 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W 463 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 464 * 465 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0). 466 * 467 * This negation looks like it's safe in practice, because bits 0:4 will 468 * surely be TRIANGLES 469 */ 470 471 if (value1->f32[0] == -1.0f) { 472 g0.negate = true; 473 } 474 475 bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), 476 g0, brw_imm_uw(0x3f80)); 477 } else { 478 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */ 479 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); 480 481 /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 482 * 483 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D 484 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 485 * 486 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0). 
487 * 488 * This negation looks like it's safe in practice, because bits 0:4 will 489 * surely be TRIANGLES 490 */ 491 492 if (value1->f32[0] == -1.0f) { 493 g1_6.negate = true; 494 } 495 496 bld.OR(tmp, g1_6, brw_imm_d(0x3f800000)); 497 } 498 bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000)); 499 500 return true; 501} 502 503static void 504emit_find_msb_using_lzd(const fs_builder &bld, 505 const fs_reg &result, 506 const fs_reg &src, 507 bool is_signed) 508{ 509 fs_inst *inst; 510 fs_reg temp = src; 511 512 if (is_signed) { 513 /* LZD of an absolute value source almost always does the right 514 * thing. There are two problem values: 515 * 516 * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns 517 * 0. However, findMSB(int(0x80000000)) == 30. 518 * 519 * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns 520 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 521 * 522 * For a value of zero or negative one, -1 will be returned. 523 * 524 * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but 525 * findMSB(-(1<<x)) should return x-1. 526 * 527 * For all negative number cases, including 0x80000000 and 528 * 0xffffffff, the correct value is obtained from LZD if instead of 529 * negating the (already negative) value the logical-not is used. A 530 * conditonal logical-not can be achieved in two instructions. 531 */ 532 temp = bld.vgrf(BRW_REGISTER_TYPE_D); 533 534 bld.ASR(temp, src, brw_imm_d(31)); 535 bld.XOR(temp, temp, src); 536 } 537 538 bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), 539 retype(temp, BRW_REGISTER_TYPE_UD)); 540 541 /* LZD counts from the MSB side, while GLSL's findMSB() wants the count 542 * from the LSB side. Subtract the result from 31 to convert the MSB 543 * count into an LSB count. If no bits are set, LZD will return 32. 544 * 31-32 = -1, which is exactly what findMSB() is supposed to return. 545 */ 546 inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31)); 547 inst->src[0].negate = true; 548} 549 550void 551fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) 552{ 553 struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key; 554 fs_inst *inst; 555 556 fs_reg result = get_nir_dest(instr->dest.dest); 557 result.type = brw_type_for_nir_type( 558 (nir_alu_type)(nir_op_infos[instr->op].output_type | 559 nir_dest_bit_size(instr->dest.dest))); 560 561 fs_reg op[4]; 562 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 563 op[i] = get_nir_src(instr->src[i].src); 564 op[i].type = brw_type_for_nir_type( 565 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] | 566 nir_src_bit_size(instr->src[i].src))); 567 op[i].abs = instr->src[i].abs; 568 op[i].negate = instr->src[i].negate; 569 } 570 571 /* We get a bunch of mov's out of the from_ssa pass and they may still 572 * be vectorized. We'll handle them as a special-case. We'll also 573 * handle vecN here because it's basically the same thing. 
574 */ 575 switch (instr->op) { 576 case nir_op_imov: 577 case nir_op_fmov: 578 case nir_op_vec2: 579 case nir_op_vec3: 580 case nir_op_vec4: { 581 fs_reg temp = result; 582 bool need_extra_copy = false; 583 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 584 if (!instr->src[i].src.is_ssa && 585 instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) { 586 need_extra_copy = true; 587 temp = bld.vgrf(result.type, 4); 588 break; 589 } 590 } 591 592 for (unsigned i = 0; i < 4; i++) { 593 if (!(instr->dest.write_mask & (1 << i))) 594 continue; 595 596 if (instr->op == nir_op_imov || instr->op == nir_op_fmov) { 597 inst = bld.MOV(offset(temp, bld, i), 598 offset(op[0], bld, instr->src[0].swizzle[i])); 599 } else { 600 inst = bld.MOV(offset(temp, bld, i), 601 offset(op[i], bld, instr->src[i].swizzle[0])); 602 } 603 inst->saturate = instr->dest.saturate; 604 } 605 606 /* In this case the source and destination registers were the same, 607 * so we need to insert an extra set of moves in order to deal with 608 * any swizzling. 609 */ 610 if (need_extra_copy) { 611 for (unsigned i = 0; i < 4; i++) { 612 if (!(instr->dest.write_mask & (1 << i))) 613 continue; 614 615 bld.MOV(offset(result, bld, i), offset(temp, bld, i)); 616 } 617 } 618 return; 619 } 620 default: 621 break; 622 } 623 624 /* At this point, we have dealt with any instruction that operates on 625 * more than a single channel. Therefore, we can just adjust the source 626 * and destination registers for that channel and emit the instruction. 627 */ 628 unsigned channel = 0; 629 if (nir_op_infos[instr->op].output_size == 0) { 630 /* Since NIR is doing the scalarizing for us, we should only ever see 631 * vectorized operations with a single channel. 632 */ 633 assert(_mesa_bitcount(instr->dest.write_mask) == 1); 634 channel = ffs(instr->dest.write_mask) - 1; 635 636 result = offset(result, bld, channel); 637 } 638 639 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 640 assert(nir_op_infos[instr->op].input_sizes[i] < 2); 641 op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]); 642 } 643 644 switch (instr->op) { 645 case nir_op_i2f: 646 case nir_op_u2f: 647 if (optimize_extract_to_float(instr, result)) 648 return; 649 inst = bld.MOV(result, op[0]); 650 inst->saturate = instr->dest.saturate; 651 break; 652 653 case nir_op_f2d: 654 case nir_op_i2d: 655 case nir_op_u2d: 656 /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions: 657 * 658 * "When source or destination is 64b (...), regioning in Align1 659 * must follow these rules: 660 * 661 * 1. Source and destination horizontal stride must be aligned to 662 * the same qword. 663 * (...)" 664 * 665 * This means that 32-bit to 64-bit conversions need to have the 32-bit 666 * data elements aligned to 64-bit. This restriction does not apply to 667 * BDW and later. 668 */ 669 if (devinfo->is_cherryview || devinfo->is_broxton) { 670 fs_reg tmp = bld.vgrf(result.type, 1); 671 tmp = subscript(tmp, op[0].type, 0); 672 inst = bld.MOV(tmp, op[0]); 673 inst = bld.MOV(result, tmp); 674 inst->saturate = instr->dest.saturate; 675 break; 676 } 677 /* fallthrough */ 678 case nir_op_d2f: 679 case nir_op_d2i: 680 case nir_op_d2u: 681 inst = bld.MOV(result, op[0]); 682 inst->saturate = instr->dest.saturate; 683 break; 684 685 case nir_op_f2i: 686 case nir_op_f2u: 687 bld.MOV(result, op[0]); 688 break; 689 690 case nir_op_fsign: { 691 if (type_sz(op[0].type) < 8) { 692 /* AND(val, 0x80000000) gives the sign bit. 
693 * 694 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not 695 * zero. 696 */ 697 bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); 698 699 fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD); 700 op[0].type = BRW_REGISTER_TYPE_UD; 701 result.type = BRW_REGISTER_TYPE_UD; 702 bld.AND(result_int, op[0], brw_imm_ud(0x80000000u)); 703 704 inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u)); 705 inst->predicate = BRW_PREDICATE_NORMAL; 706 if (instr->dest.saturate) { 707 inst = bld.MOV(result, result); 708 inst->saturate = true; 709 } 710 } else { 711 /* For doubles we do the same but we need to consider: 712 * 713 * - 2-src instructions can't operate with 64-bit immediates 714 * - The sign is encoded in the high 32-bit of each DF 715 * - CMP with DF requires special handling in SIMD16 716 * - We need to produce a DF result. 717 */ 718 719 /* 2-src instructions can't have 64-bit immediates, so put 0.0 in 720 * a register and compare with that. 721 */ 722 fs_reg tmp = vgrf(glsl_type::double_type); 723 bld.MOV(tmp, setup_imm_df(bld, 0.0)); 724 725 /* A direct DF CMP using the flag register (null dst) won't work in 726 * SIMD16 because the CMP will be split in two by lower_simd_width, 727 * resulting in two CMP instructions with the same dst (NULL), 728 * leading to dead code elimination of the first one. In SIMD8, 729 * however, there is no need to split the CMP and we can save some 730 * work. 731 */ 732 fs_reg dst_tmp = vgrf(glsl_type::double_type); 733 bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ); 734 735 /* In SIMD16 we want to avoid using a NULL dst register with DF CMP, 736 * so we store the result of the comparison in a vgrf instead and 737 * then we generate a UD comparison from that that won't have to 738 * be split by lower_simd_width. This is what NIR does to handle 739 * double comparisons in the general case. 740 */ 741 if (bld.dispatch_width() == 16 ) { 742 fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD); 743 bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0)); 744 bld.CMP(bld.null_reg_ud(), 745 dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ); 746 } 747 748 /* Get the high 32-bit of each double component where the sign is */ 749 fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD); 750 bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); 751 752 /* Get the sign bit */ 753 bld.AND(result_int, result_int, brw_imm_ud(0x80000000u)); 754 755 /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */ 756 inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u)); 757 inst->predicate = BRW_PREDICATE_NORMAL; 758 759 /* Convert from 32-bit float to 64-bit double */ 760 result.type = BRW_REGISTER_TYPE_DF; 761 inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F)); 762 763 if (instr->dest.saturate) { 764 inst = bld.MOV(result, result); 765 inst->saturate = true; 766 } 767 } 768 break; 769 } 770 771 case nir_op_isign: 772 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). 773 * -> non-negative val generates 0x00000000. 774 * Predicated OR sets 1 if val is positive. 
775 */ 776 assert(nir_dest_bit_size(instr->dest.dest) < 64); 777 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G); 778 bld.ASR(result, op[0], brw_imm_d(31)); 779 inst = bld.OR(result, result, brw_imm_d(1)); 780 inst->predicate = BRW_PREDICATE_NORMAL; 781 break; 782 783 case nir_op_frcp: 784 inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); 785 inst->saturate = instr->dest.saturate; 786 break; 787 788 case nir_op_fexp2: 789 inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); 790 inst->saturate = instr->dest.saturate; 791 break; 792 793 case nir_op_flog2: 794 inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); 795 inst->saturate = instr->dest.saturate; 796 break; 797 798 case nir_op_fsin: 799 inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); 800 inst->saturate = instr->dest.saturate; 801 break; 802 803 case nir_op_fcos: 804 inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); 805 inst->saturate = instr->dest.saturate; 806 break; 807 808 case nir_op_fddx: 809 if (fs_key->high_quality_derivatives) { 810 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); 811 } else { 812 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); 813 } 814 inst->saturate = instr->dest.saturate; 815 break; 816 case nir_op_fddx_fine: 817 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); 818 inst->saturate = instr->dest.saturate; 819 break; 820 case nir_op_fddx_coarse: 821 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); 822 inst->saturate = instr->dest.saturate; 823 break; 824 case nir_op_fddy: 825 if (fs_key->high_quality_derivatives) { 826 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); 827 } else { 828 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); 829 } 830 inst->saturate = instr->dest.saturate; 831 break; 832 case nir_op_fddy_fine: 833 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); 834 inst->saturate = instr->dest.saturate; 835 break; 836 case nir_op_fddy_coarse: 837 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); 838 inst->saturate = instr->dest.saturate; 839 break; 840 841 case nir_op_iadd: 842 assert(nir_dest_bit_size(instr->dest.dest) < 64); 843 case nir_op_fadd: 844 inst = bld.ADD(result, op[0], op[1]); 845 inst->saturate = instr->dest.saturate; 846 break; 847 848 case nir_op_fmul: 849 inst = bld.MUL(result, op[0], op[1]); 850 inst->saturate = instr->dest.saturate; 851 break; 852 853 case nir_op_imul: 854 assert(nir_dest_bit_size(instr->dest.dest) < 64); 855 bld.MUL(result, op[0], op[1]); 856 break; 857 858 case nir_op_imul_high: 859 case nir_op_umul_high: 860 assert(nir_dest_bit_size(instr->dest.dest) < 64); 861 bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); 862 break; 863 864 case nir_op_idiv: 865 case nir_op_udiv: 866 assert(nir_dest_bit_size(instr->dest.dest) < 64); 867 bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); 868 break; 869 870 case nir_op_uadd_carry: 871 unreachable("Should have been lowered by carry_to_arith()."); 872 873 case nir_op_usub_borrow: 874 unreachable("Should have been lowered by borrow_to_arith()."); 875 876 case nir_op_umod: 877 case nir_op_irem: 878 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it 879 * appears that our hardware just does the right thing for signed 880 * remainder. 881 */ 882 assert(nir_dest_bit_size(instr->dest.dest) < 64); 883 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); 884 break; 885 886 case nir_op_imod: { 887 /* Get a regular C-style remainder. If a % b == 0, set the predicate. 
*/ 888 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); 889 890 /* Math instructions don't support conditional mod */ 891 inst = bld.MOV(bld.null_reg_d(), result); 892 inst->conditional_mod = BRW_CONDITIONAL_NZ; 893 894 /* Now, we need to determine if signs of the sources are different. 895 * When we XOR the sources, the top bit is 0 if they are the same and 1 896 * if they are different. We can then use a conditional modifier to 897 * turn that into a predicate. This leads us to an XOR.l instruction. 898 * 899 * Technically, according to the PRM, you're not allowed to use .l on a 900 * XOR instruction. However, emperical experiments and Curro's reading 901 * of the simulator source both indicate that it's safe. 902 */ 903 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); 904 inst = bld.XOR(tmp, op[0], op[1]); 905 inst->predicate = BRW_PREDICATE_NORMAL; 906 inst->conditional_mod = BRW_CONDITIONAL_L; 907 908 /* If the result of the initial remainder operation is non-zero and the 909 * two sources have different signs, add in a copy of op[1] to get the 910 * final integer modulus value. 911 */ 912 inst = bld.ADD(result, result, op[1]); 913 inst->predicate = BRW_PREDICATE_NORMAL; 914 break; 915 } 916 917 case nir_op_flt: 918 case nir_op_fge: 919 case nir_op_feq: 920 case nir_op_fne: { 921 fs_reg dest = result; 922 if (nir_src_bit_size(instr->src[0].src) > 32) { 923 dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1); 924 } 925 brw_conditional_mod cond; 926 switch (instr->op) { 927 case nir_op_flt: 928 cond = BRW_CONDITIONAL_L; 929 break; 930 case nir_op_fge: 931 cond = BRW_CONDITIONAL_GE; 932 break; 933 case nir_op_feq: 934 cond = BRW_CONDITIONAL_Z; 935 break; 936 case nir_op_fne: 937 cond = BRW_CONDITIONAL_NZ; 938 break; 939 default: 940 unreachable("bad opcode"); 941 } 942 bld.CMP(dest, op[0], op[1], cond); 943 if (nir_src_bit_size(instr->src[0].src) > 32) { 944 bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0)); 945 } 946 break; 947 } 948 949 case nir_op_ilt: 950 case nir_op_ult: 951 assert(nir_dest_bit_size(instr->dest.dest) < 64); 952 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L); 953 break; 954 955 case nir_op_ige: 956 case nir_op_uge: 957 assert(nir_dest_bit_size(instr->dest.dest) < 64); 958 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE); 959 break; 960 961 case nir_op_ieq: 962 assert(nir_dest_bit_size(instr->dest.dest) < 64); 963 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z); 964 break; 965 966 case nir_op_ine: 967 assert(nir_dest_bit_size(instr->dest.dest) < 64); 968 bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ); 969 break; 970 971 case nir_op_inot: 972 assert(nir_dest_bit_size(instr->dest.dest) < 64); 973 if (devinfo->gen >= 8) { 974 op[0] = resolve_source_modifiers(op[0]); 975 } 976 bld.NOT(result, op[0]); 977 break; 978 case nir_op_ixor: 979 assert(nir_dest_bit_size(instr->dest.dest) < 64); 980 if (devinfo->gen >= 8) { 981 op[0] = resolve_source_modifiers(op[0]); 982 op[1] = resolve_source_modifiers(op[1]); 983 } 984 bld.XOR(result, op[0], op[1]); 985 break; 986 case nir_op_ior: 987 assert(nir_dest_bit_size(instr->dest.dest) < 64); 988 if (devinfo->gen >= 8) { 989 op[0] = resolve_source_modifiers(op[0]); 990 op[1] = resolve_source_modifiers(op[1]); 991 } 992 bld.OR(result, op[0], op[1]); 993 break; 994 case nir_op_iand: 995 assert(nir_dest_bit_size(instr->dest.dest) < 64); 996 if (devinfo->gen >= 8) { 997 op[0] = resolve_source_modifiers(op[0]); 998 op[1] = resolve_source_modifiers(op[1]); 999 } 1000 bld.AND(result, op[0], op[1]); 1001 break; 1002 1003 case 
nir_op_fdot2: 1004 case nir_op_fdot3: 1005 case nir_op_fdot4: 1006 case nir_op_ball_fequal2: 1007 case nir_op_ball_iequal2: 1008 case nir_op_ball_fequal3: 1009 case nir_op_ball_iequal3: 1010 case nir_op_ball_fequal4: 1011 case nir_op_ball_iequal4: 1012 case nir_op_bany_fnequal2: 1013 case nir_op_bany_inequal2: 1014 case nir_op_bany_fnequal3: 1015 case nir_op_bany_inequal3: 1016 case nir_op_bany_fnequal4: 1017 case nir_op_bany_inequal4: 1018 unreachable("Lowered by nir_lower_alu_reductions"); 1019 1020 case nir_op_fnoise1_1: 1021 case nir_op_fnoise1_2: 1022 case nir_op_fnoise1_3: 1023 case nir_op_fnoise1_4: 1024 case nir_op_fnoise2_1: 1025 case nir_op_fnoise2_2: 1026 case nir_op_fnoise2_3: 1027 case nir_op_fnoise2_4: 1028 case nir_op_fnoise3_1: 1029 case nir_op_fnoise3_2: 1030 case nir_op_fnoise3_3: 1031 case nir_op_fnoise3_4: 1032 case nir_op_fnoise4_1: 1033 case nir_op_fnoise4_2: 1034 case nir_op_fnoise4_3: 1035 case nir_op_fnoise4_4: 1036 unreachable("not reached: should be handled by lower_noise"); 1037 1038 case nir_op_ldexp: 1039 unreachable("not reached: should be handled by ldexp_to_arith()"); 1040 1041 case nir_op_fsqrt: 1042 inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); 1043 inst->saturate = instr->dest.saturate; 1044 break; 1045 1046 case nir_op_frsq: 1047 inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); 1048 inst->saturate = instr->dest.saturate; 1049 break; 1050 1051 case nir_op_b2i: 1052 case nir_op_b2f: 1053 bld.MOV(result, negate(op[0])); 1054 break; 1055 1056 case nir_op_f2b: 1057 bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); 1058 break; 1059 case nir_op_d2b: { 1060 /* two-argument instructions can't take 64-bit immediates */ 1061 fs_reg zero = vgrf(glsl_type::double_type); 1062 bld.MOV(zero, setup_imm_df(bld, 0.0)); 1063 /* A SIMD16 execution needs to be split in two instructions, so use 1064 * a vgrf instead of the flag register as dst so instruction splitting 1065 * works 1066 */ 1067 fs_reg tmp = vgrf(glsl_type::double_type); 1068 bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ); 1069 bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0)); 1070 break; 1071 } 1072 case nir_op_i2b: 1073 bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); 1074 break; 1075 1076 case nir_op_ftrunc: 1077 inst = bld.RNDZ(result, op[0]); 1078 inst->saturate = instr->dest.saturate; 1079 break; 1080 1081 case nir_op_fceil: { 1082 op[0].negate = !op[0].negate; 1083 fs_reg temp = vgrf(glsl_type::float_type); 1084 bld.RNDD(temp, op[0]); 1085 temp.negate = true; 1086 inst = bld.MOV(result, temp); 1087 inst->saturate = instr->dest.saturate; 1088 break; 1089 } 1090 case nir_op_ffloor: 1091 inst = bld.RNDD(result, op[0]); 1092 inst->saturate = instr->dest.saturate; 1093 break; 1094 case nir_op_ffract: 1095 inst = bld.FRC(result, op[0]); 1096 inst->saturate = instr->dest.saturate; 1097 break; 1098 case nir_op_fround_even: 1099 inst = bld.RNDE(result, op[0]); 1100 inst->saturate = instr->dest.saturate; 1101 break; 1102 1103 case nir_op_fquantize2f16: { 1104 fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); 1105 fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); 1106 fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); 1107 1108 /* The destination stride must be at least as big as the source stride. 
*/ 1109 tmp16.type = BRW_REGISTER_TYPE_W; 1110 tmp16.stride = 2; 1111 1112 /* Check for denormal */ 1113 fs_reg abs_src0 = op[0]; 1114 abs_src0.abs = true; 1115 bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), 1116 BRW_CONDITIONAL_L); 1117 /* Get the appropriately signed zero */ 1118 bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), 1119 retype(op[0], BRW_REGISTER_TYPE_UD), 1120 brw_imm_ud(0x80000000)); 1121 /* Do the actual F32 -> F16 -> F32 conversion */ 1122 bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]); 1123 bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16); 1124 /* Select that or zero based on normal status */ 1125 inst = bld.SEL(result, zero, tmp32); 1126 inst->predicate = BRW_PREDICATE_NORMAL; 1127 inst->saturate = instr->dest.saturate; 1128 break; 1129 } 1130 1131 case nir_op_imin: 1132 case nir_op_umin: 1133 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1134 case nir_op_fmin: 1135 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); 1136 inst->saturate = instr->dest.saturate; 1137 break; 1138 1139 case nir_op_imax: 1140 case nir_op_umax: 1141 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1142 case nir_op_fmax: 1143 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); 1144 inst->saturate = instr->dest.saturate; 1145 break; 1146 1147 case nir_op_pack_snorm_2x16: 1148 case nir_op_pack_snorm_4x8: 1149 case nir_op_pack_unorm_2x16: 1150 case nir_op_pack_unorm_4x8: 1151 case nir_op_unpack_snorm_2x16: 1152 case nir_op_unpack_snorm_4x8: 1153 case nir_op_unpack_unorm_2x16: 1154 case nir_op_unpack_unorm_4x8: 1155 case nir_op_unpack_half_2x16: 1156 case nir_op_pack_half_2x16: 1157 unreachable("not reached: should be handled by lower_packing_builtins"); 1158 1159 case nir_op_unpack_half_2x16_split_x: 1160 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]); 1161 inst->saturate = instr->dest.saturate; 1162 break; 1163 case nir_op_unpack_half_2x16_split_y: 1164 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]); 1165 inst->saturate = instr->dest.saturate; 1166 break; 1167 1168 case nir_op_pack_double_2x32_split: 1169 bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); 1170 break; 1171 1172 case nir_op_unpack_double_2x32_split_x: 1173 case nir_op_unpack_double_2x32_split_y: { 1174 /* Optimize the common case where we are unpacking from a double we have 1175 * previously packed. In this case we can just bypass the pack operation 1176 * and source directly from its arguments. 1177 */ 1178 unsigned index = (instr->op == nir_op_unpack_double_2x32_split_x) ? 
0 : 1; 1179 if (instr->src[0].src.is_ssa) { 1180 nir_instr *parent_instr = instr->src[0].src.ssa->parent_instr; 1181 if (parent_instr->type == nir_instr_type_alu) { 1182 nir_alu_instr *alu_parent = nir_instr_as_alu(parent_instr); 1183 if (alu_parent->op == nir_op_pack_double_2x32_split && 1184 alu_parent->src[index].src.is_ssa) { 1185 op[0] = retype(get_nir_src(alu_parent->src[index].src), 1186 BRW_REGISTER_TYPE_UD); 1187 op[0] = 1188 offset(op[0], bld, alu_parent->src[index].swizzle[channel]); 1189 bld.MOV(result, op[0]); 1190 break; 1191 } 1192 } 1193 } 1194 1195 if (instr->op == nir_op_unpack_double_2x32_split_x) 1196 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0)); 1197 else 1198 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); 1199 break; 1200 } 1201 1202 case nir_op_fpow: 1203 inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); 1204 inst->saturate = instr->dest.saturate; 1205 break; 1206 1207 case nir_op_bitfield_reverse: 1208 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1209 bld.BFREV(result, op[0]); 1210 break; 1211 1212 case nir_op_bit_count: 1213 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1214 bld.CBIT(result, op[0]); 1215 break; 1216 1217 case nir_op_ufind_msb: { 1218 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1219 emit_find_msb_using_lzd(bld, result, op[0], false); 1220 break; 1221 } 1222 1223 case nir_op_ifind_msb: { 1224 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1225 1226 if (devinfo->gen < 7) { 1227 emit_find_msb_using_lzd(bld, result, op[0], true); 1228 } else { 1229 bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]); 1230 1231 /* FBH counts from the MSB side, while GLSL's findMSB() wants the 1232 * count from the LSB side. If FBH didn't return an error 1233 * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB 1234 * count into an LSB count. 1235 */ 1236 bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ); 1237 1238 inst = bld.ADD(result, result, brw_imm_d(31)); 1239 inst->predicate = BRW_PREDICATE_NORMAL; 1240 inst->src[0].negate = true; 1241 } 1242 break; 1243 } 1244 1245 case nir_op_find_lsb: 1246 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1247 1248 if (devinfo->gen < 7) { 1249 fs_reg temp = vgrf(glsl_type::int_type); 1250 1251 /* (x & -x) generates a value that consists of only the LSB of x. 1252 * For all powers of 2, findMSB(y) == findLSB(y). 1253 */ 1254 fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D); 1255 fs_reg negated_src = src; 1256 1257 /* One must be negated, and the other must be non-negated. It 1258 * doesn't matter which is which. 
1259 */ 1260 negated_src.negate = true; 1261 src.negate = false; 1262 1263 bld.AND(temp, src, negated_src); 1264 emit_find_msb_using_lzd(bld, result, temp, false); 1265 } else { 1266 bld.FBL(result, op[0]); 1267 } 1268 break; 1269 1270 case nir_op_ubitfield_extract: 1271 case nir_op_ibitfield_extract: 1272 unreachable("should have been lowered"); 1273 case nir_op_ubfe: 1274 case nir_op_ibfe: 1275 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1276 bld.BFE(result, op[2], op[1], op[0]); 1277 break; 1278 case nir_op_bfm: 1279 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1280 bld.BFI1(result, op[0], op[1]); 1281 break; 1282 case nir_op_bfi: 1283 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1284 bld.BFI2(result, op[0], op[1], op[2]); 1285 break; 1286 1287 case nir_op_bitfield_insert: 1288 unreachable("not reached: should have been lowered"); 1289 1290 case nir_op_ishl: 1291 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1292 bld.SHL(result, op[0], op[1]); 1293 break; 1294 case nir_op_ishr: 1295 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1296 bld.ASR(result, op[0], op[1]); 1297 break; 1298 case nir_op_ushr: 1299 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1300 bld.SHR(result, op[0], op[1]); 1301 break; 1302 1303 case nir_op_pack_half_2x16_split: 1304 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); 1305 break; 1306 1307 case nir_op_ffma: 1308 inst = bld.MAD(result, op[2], op[1], op[0]); 1309 inst->saturate = instr->dest.saturate; 1310 break; 1311 1312 case nir_op_flrp: 1313 inst = bld.LRP(result, op[0], op[1], op[2]); 1314 inst->saturate = instr->dest.saturate; 1315 break; 1316 1317 case nir_op_bcsel: 1318 if (optimize_frontfacing_ternary(instr, result)) 1319 return; 1320 1321 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); 1322 inst = bld.SEL(result, op[1], op[2]); 1323 inst->predicate = BRW_PREDICATE_NORMAL; 1324 break; 1325 1326 case nir_op_extract_u8: 1327 case nir_op_extract_i8: { 1328 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1329 nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); 1330 assert(byte != NULL); 1331 bld.MOV(result, subscript(op[0], type, byte->u32[0])); 1332 break; 1333 } 1334 1335 case nir_op_extract_u16: 1336 case nir_op_extract_i16: { 1337 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); 1338 nir_const_value *word = nir_src_as_const_value(instr->src[1].src); 1339 assert(word != NULL); 1340 bld.MOV(result, subscript(op[0], type, word->u32[0])); 1341 break; 1342 } 1343 1344 default: 1345 unreachable("unhandled instruction"); 1346 } 1347 1348 /* If we need to do a boolean resolve, replace the result with -(x & 1) 1349 * to sign extend the low bit to 0/~0 1350 */ 1351 if (devinfo->gen <= 5 && 1352 (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { 1353 fs_reg masked = vgrf(glsl_type::int_type); 1354 bld.AND(masked, result, brw_imm_d(1)); 1355 masked.negate = true; 1356 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); 1357 } 1358} 1359 1360void 1361fs_visitor::nir_emit_load_const(const fs_builder &bld, 1362 nir_load_const_instr *instr) 1363{ 1364 const brw_reg_type reg_type = 1365 instr->def.bit_size == 32 ? 
BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF; 1366 fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); 1367 1368 switch (instr->def.bit_size) { 1369 case 32: 1370 for (unsigned i = 0; i < instr->def.num_components; i++) 1371 bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i])); 1372 break; 1373 1374 case 64: 1375 for (unsigned i = 0; i < instr->def.num_components; i++) 1376 bld.MOV(offset(reg, bld, i), 1377 setup_imm_df(bld, instr->value.f64[i])); 1378 break; 1379 1380 default: 1381 unreachable("Invalid bit size"); 1382 } 1383 1384 nir_ssa_values[instr->def.index] = reg; 1385} 1386 1387fs_reg 1388fs_visitor::get_nir_src(const nir_src &src) 1389{ 1390 fs_reg reg; 1391 if (src.is_ssa) { 1392 if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) { 1393 const brw_reg_type reg_type = src.ssa->bit_size == 32 ? 1394 BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF; 1395 reg = bld.vgrf(reg_type, src.ssa->num_components); 1396 } else { 1397 reg = nir_ssa_values[src.ssa->index]; 1398 } 1399 } else { 1400 /* We don't handle indirects on locals */ 1401 assert(src.reg.indirect == NULL); 1402 reg = offset(nir_locals[src.reg.reg->index], bld, 1403 src.reg.base_offset * src.reg.reg->num_components); 1404 } 1405 1406 /* to avoid floating-point denorm flushing problems, set the type by 1407 * default to D - instructions that need floating point semantics will set 1408 * this to F if they need to 1409 */ 1410 return retype(reg, BRW_REGISTER_TYPE_D); 1411} 1412 1413/** 1414 * Return an IMM for constants; otherwise call get_nir_src() as normal. 1415 */ 1416fs_reg 1417fs_visitor::get_nir_src_imm(const nir_src &src) 1418{ 1419 nir_const_value *val = nir_src_as_const_value(src); 1420 return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src); 1421} 1422 1423fs_reg 1424fs_visitor::get_nir_dest(const nir_dest &dest) 1425{ 1426 if (dest.is_ssa) { 1427 const brw_reg_type reg_type = 1428 dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF; 1429 nir_ssa_values[dest.ssa.index] = 1430 bld.vgrf(reg_type, dest.ssa.num_components); 1431 return nir_ssa_values[dest.ssa.index]; 1432 } else { 1433 /* We don't handle indirects on locals */ 1434 assert(dest.reg.indirect == NULL); 1435 return offset(nir_locals[dest.reg.reg->index], bld, 1436 dest.reg.base_offset * dest.reg.reg->num_components); 1437 } 1438} 1439 1440fs_reg 1441fs_visitor::get_nir_image_deref(const nir_deref_var *deref) 1442{ 1443 fs_reg image(UNIFORM, deref->var->data.driver_location / 4, 1444 BRW_REGISTER_TYPE_UD); 1445 fs_reg indirect; 1446 unsigned indirect_max = 0; 1447 1448 for (const nir_deref *tail = &deref->deref; tail->child; 1449 tail = tail->child) { 1450 const nir_deref_array *deref_array = nir_deref_as_array(tail->child); 1451 assert(tail->child->deref_type == nir_deref_type_array); 1452 const unsigned size = glsl_get_length(tail->type); 1453 const unsigned element_size = type_size_scalar(deref_array->deref.type); 1454 const unsigned base = MIN2(deref_array->base_offset, size - 1); 1455 image = offset(image, bld, base * element_size); 1456 1457 if (deref_array->deref_array_type == nir_deref_array_type_indirect) { 1458 fs_reg tmp = vgrf(glsl_type::uint_type); 1459 1460 /* Accessing an invalid surface index with the dataport can result 1461 * in a hang. 
According to the spec "if the index used to 1462 * select an individual element is negative or greater than or 1463 * equal to the size of the array, the results of the operation 1464 * are undefined but may not lead to termination" -- which is one 1465 * of the possible outcomes of the hang. Clamp the index to 1466 * prevent access outside of the array bounds. 1467 */ 1468 bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect), 1469 BRW_REGISTER_TYPE_UD), 1470 brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L); 1471 1472 indirect_max += element_size * (tail->type->length - 1); 1473 1474 bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4)); 1475 if (indirect.file == BAD_FILE) { 1476 indirect = tmp; 1477 } else { 1478 bld.ADD(indirect, indirect, tmp); 1479 } 1480 } 1481 } 1482 1483 if (indirect.file == BAD_FILE) { 1484 return image; 1485 } else { 1486 /* Emit a pile of MOVs to load the uniform into a temporary. The 1487 * dead-code elimination pass will get rid of what we don't use. 1488 */ 1489 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE); 1490 for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) { 1491 bld.emit(SHADER_OPCODE_MOV_INDIRECT, 1492 offset(tmp, bld, j), offset(image, bld, j), 1493 indirect, brw_imm_ud((indirect_max + 1) * 4)); 1494 } 1495 return tmp; 1496 } 1497} 1498 1499void 1500fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, 1501 unsigned wr_mask) 1502{ 1503 for (unsigned i = 0; i < 4; i++) { 1504 if (!((wr_mask >> i) & 1)) 1505 continue; 1506 1507 fs_inst *new_inst = new(mem_ctx) fs_inst(inst); 1508 new_inst->dst = offset(new_inst->dst, bld, i); 1509 for (unsigned j = 0; j < new_inst->sources; j++) 1510 if (new_inst->src[j].file == VGRF) 1511 new_inst->src[j] = offset(new_inst->src[j], bld, i); 1512 1513 bld.emit(new_inst); 1514 } 1515} 1516 1517/** 1518 * Get the matching channel register datatype for an image intrinsic of the 1519 * specified GLSL image type. 1520 */ 1521static brw_reg_type 1522get_image_base_type(const glsl_type *type) 1523{ 1524 switch ((glsl_base_type)type->sampled_type) { 1525 case GLSL_TYPE_UINT: 1526 return BRW_REGISTER_TYPE_UD; 1527 case GLSL_TYPE_INT: 1528 return BRW_REGISTER_TYPE_D; 1529 case GLSL_TYPE_FLOAT: 1530 return BRW_REGISTER_TYPE_F; 1531 default: 1532 unreachable("Not reached."); 1533 } 1534} 1535 1536/** 1537 * Get the appropriate atomic op for an image atomic intrinsic. 1538 */ 1539static unsigned 1540get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type) 1541{ 1542 switch (op) { 1543 case nir_intrinsic_image_atomic_add: 1544 return BRW_AOP_ADD; 1545 case nir_intrinsic_image_atomic_min: 1546 return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ? 1547 BRW_AOP_IMIN : BRW_AOP_UMIN); 1548 case nir_intrinsic_image_atomic_max: 1549 return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ? 
1550 BRW_AOP_IMAX : BRW_AOP_UMAX); 1551 case nir_intrinsic_image_atomic_and: 1552 return BRW_AOP_AND; 1553 case nir_intrinsic_image_atomic_or: 1554 return BRW_AOP_OR; 1555 case nir_intrinsic_image_atomic_xor: 1556 return BRW_AOP_XOR; 1557 case nir_intrinsic_image_atomic_exchange: 1558 return BRW_AOP_MOV; 1559 case nir_intrinsic_image_atomic_comp_swap: 1560 return BRW_AOP_CMPWR; 1561 default: 1562 unreachable("Not reachable."); 1563 } 1564} 1565 1566static fs_inst * 1567emit_pixel_interpolater_send(const fs_builder &bld, 1568 enum opcode opcode, 1569 const fs_reg &dst, 1570 const fs_reg &src, 1571 const fs_reg &desc, 1572 glsl_interp_mode interpolation) 1573{ 1574 struct brw_wm_prog_data *wm_prog_data = 1575 brw_wm_prog_data(bld.shader->stage_prog_data); 1576 fs_inst *inst; 1577 fs_reg payload; 1578 int mlen; 1579 1580 if (src.file == BAD_FILE) { 1581 /* Dummy payload */ 1582 payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1); 1583 mlen = 1; 1584 } else { 1585 payload = src; 1586 mlen = 2 * bld.dispatch_width() / 8; 1587 } 1588 1589 inst = bld.emit(opcode, dst, payload, desc); 1590 inst->mlen = mlen; 1591 /* 2 floats per slot returned */ 1592 inst->size_written = 2 * dst.component_size(inst->exec_size); 1593 inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE; 1594 1595 wm_prog_data->pulls_bary = true; 1596 1597 return inst; 1598} 1599 1600/** 1601 * Computes 1 << x, given a D/UD register containing some value x. 1602 */ 1603static fs_reg 1604intexp2(const fs_builder &bld, const fs_reg &x) 1605{ 1606 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); 1607 1608 fs_reg result = bld.vgrf(x.type, 1); 1609 fs_reg one = bld.vgrf(x.type, 1); 1610 1611 bld.MOV(one, retype(brw_imm_d(1), one.type)); 1612 bld.SHL(result, one, x); 1613 return result; 1614} 1615 1616void 1617fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) 1618{ 1619 assert(stage == MESA_SHADER_GEOMETRY); 1620 1621 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1622 1623 if (gs_compile->control_data_header_size_bits == 0) 1624 return; 1625 1626 /* We can only do EndPrimitive() functionality when the control data 1627 * consists of cut bits. Fortunately, the only time it isn't is when the 1628 * output type is points, in which case EndPrimitive() is a no-op. 1629 */ 1630 if (gs_prog_data->control_data_format != 1631 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { 1632 return; 1633 } 1634 1635 /* Cut bits use one bit per vertex. */ 1636 assert(gs_compile->control_data_bits_per_vertex == 1); 1637 1638 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 1639 vertex_count.type = BRW_REGISTER_TYPE_UD; 1640 1641 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting 1642 * vertex n, 0 otherwise. So all we need to do here is mark bit 1643 * (vertex_count - 1) % 32 in the cut_bits register to indicate that 1644 * EndPrimitive() was called after emitting vertex (vertex_count - 1); 1645 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. 1646 * 1647 * Note that if EndPrimitive() is called before emitting any vertices, this 1648 * will cause us to set bit 31 of the control_data_bits register to 1. 1649 * That's fine because: 1650 * 1651 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be 1652 * output, so the hardware will ignore cut bit 31. 
1653 * 1654 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the 1655 * last vertex, so setting cut bit 31 has no effect (since the primitive 1656 * is automatically ended when the GS terminates). 1657 * 1658 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the 1659 * control_data_bits register to 0 when the first vertex is emitted. 1660 */ 1661 1662 const fs_builder abld = bld.annotate("end primitive"); 1663 1664 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ 1665 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1666 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 1667 fs_reg mask = intexp2(abld, prev_count); 1668 /* Note: we're relying on the fact that the GEN SHL instruction only pays 1669 * attention to the lower 5 bits of its second source argument, so on this 1670 * architecture, 1 << (vertex_count - 1) is equivalent to 1 << 1671 * ((vertex_count - 1) % 32). 1672 */ 1673 abld.OR(this->control_data_bits, this->control_data_bits, mask); 1674} 1675 1676void 1677fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) 1678{ 1679 assert(stage == MESA_SHADER_GEOMETRY); 1680 assert(gs_compile->control_data_bits_per_vertex != 0); 1681 1682 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1683 1684 const fs_builder abld = bld.annotate("emit control data bits"); 1685 const fs_builder fwa_bld = bld.exec_all(); 1686 1687 /* We use a single UD register to accumulate control data bits (32 bits 1688 * for each of the SIMD8 channels). So we need to write a DWord (32 bits) 1689 * at a time. 1690 * 1691 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. 1692 * We have select a 128-bit group via the Global and Per-Slot Offsets, then 1693 * use the Channel Mask phase to enable/disable which DWord within that 1694 * group to write. (Remember, different SIMD8 channels may have emitted 1695 * different numbers of vertices, so we may need per-slot offsets.) 1696 * 1697 * Channel masking presents an annoying problem: we may have to replicate 1698 * the data up to 4 times: 1699 * 1700 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. 1701 * 1702 * To avoid penalizing shaders that emit a small number of vertices, we 1703 * can avoid these sometimes: if the size of the control data header is 1704 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land 1705 * land in the same 128-bit group, so we can skip per-slot offsets. 1706 * 1707 * Similarly, if the control data header is <= 32 bits, there is only one 1708 * DWord, so we can skip channel masks. 
1709 */ 1710 enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; 1711 1712 fs_reg channel_mask, per_slot_offset; 1713 1714 if (gs_compile->control_data_header_size_bits > 32) { 1715 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 1716 channel_mask = vgrf(glsl_type::uint_type); 1717 } 1718 1719 if (gs_compile->control_data_header_size_bits > 128) { 1720 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; 1721 per_slot_offset = vgrf(glsl_type::uint_type); 1722 } 1723 1724 /* Figure out which DWord we're trying to write to using the formula: 1725 * 1726 * dword_index = (vertex_count - 1) * bits_per_vertex / 32 1727 * 1728 * Since bits_per_vertex is a power of two, and is known at compile 1729 * time, this can be optimized to: 1730 * 1731 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) 1732 */ 1733 if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { 1734 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1735 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1736 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 1737 unsigned log2_bits_per_vertex = 1738 util_last_bit(gs_compile->control_data_bits_per_vertex); 1739 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); 1740 1741 if (per_slot_offset.file != BAD_FILE) { 1742 /* Set the per-slot offset to dword_index / 4, so that we'll write to 1743 * the appropriate OWord within the control data header. 1744 */ 1745 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); 1746 } 1747 1748 /* Set the channel masks to 1 << (dword_index % 4), so that we'll 1749 * write to the appropriate DWORD within the OWORD. 1750 */ 1751 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1752 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); 1753 channel_mask = intexp2(fwa_bld, channel); 1754 /* Then the channel masks need to be in bits 23:16. */ 1755 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); 1756 } 1757 1758 /* Store the control data bits in the message payload and send it. */ 1759 int mlen = 2; 1760 if (channel_mask.file != BAD_FILE) 1761 mlen += 4; /* channel masks, plus 3 extra copies of the data */ 1762 if (per_slot_offset.file != BAD_FILE) 1763 mlen++; 1764 1765 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 1766 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); 1767 int i = 0; 1768 sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 1769 if (per_slot_offset.file != BAD_FILE) 1770 sources[i++] = per_slot_offset; 1771 if (channel_mask.file != BAD_FILE) 1772 sources[i++] = channel_mask; 1773 while (i < mlen) { 1774 sources[i++] = this->control_data_bits; 1775 } 1776 1777 abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); 1778 fs_inst *inst = abld.emit(opcode, reg_undef, payload); 1779 inst->mlen = mlen; 1780 /* We need to increment Global Offset by 256-bits to make room for 1781 * Broadwell's extra "Vertex Count" payload at the beginning of the 1782 * URB entry. Since this is an OWord message, Global Offset is counted 1783 * in 128-bit units, so we must set it to 2. 1784 */ 1785 if (gs_prog_data->static_vertex_count == -1) 1786 inst->offset = 2; 1787} 1788 1789void 1790fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, 1791 unsigned stream_id) 1792{ 1793 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ 1794 1795 /* Note: we are calling this *before* increasing vertex_count, so 1796 * this->vertex_count == vertex_count - 1 in the formula above. 
1797 */ 1798 1799 /* Stream mode uses 2 bits per vertex */ 1800 assert(gs_compile->control_data_bits_per_vertex == 2); 1801 1802 /* Must be a valid stream */ 1803 assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS); 1804 1805 /* Control data bits are initialized to 0 so we don't have to set any 1806 * bits when sending vertices to stream 0. 1807 */ 1808 if (stream_id == 0) 1809 return; 1810 1811 const fs_builder abld = bld.annotate("set stream control data bits", NULL); 1812 1813 /* reg::sid = stream_id */ 1814 fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1815 abld.MOV(sid, brw_imm_ud(stream_id)); 1816 1817 /* reg:shift_count = 2 * (vertex_count - 1) */ 1818 fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1819 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); 1820 1821 /* Note: we're relying on the fact that the GEN SHL instruction only pays 1822 * attention to the lower 5 bits of its second source argument, so on this 1823 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to 1824 * stream_id << ((2 * (vertex_count - 1)) % 32). 1825 */ 1826 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1827 abld.SHL(mask, sid, shift_count); 1828 abld.OR(this->control_data_bits, this->control_data_bits, mask); 1829} 1830 1831void 1832fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, 1833 unsigned stream_id) 1834{ 1835 assert(stage == MESA_SHADER_GEOMETRY); 1836 1837 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1838 1839 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 1840 vertex_count.type = BRW_REGISTER_TYPE_UD; 1841 1842 /* Haswell and later hardware ignores the "Render Stream Select" bits 1843 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, 1844 * and instead sends all primitives down the pipeline for rasterization. 1845 * If the SOL stage is enabled, "Render Stream Select" is honored and 1846 * primitives bound to non-zero streams are discarded after stream output. 1847 * 1848 * Since the only purpose of primitives sent to non-zero streams is to 1849 * be recorded by transform feedback, we can simply discard all geometry 1850 * bound to these streams when transform feedback is disabled. 1851 */ 1852 if (stream_id > 0 && !nir->info->has_transform_feedback_varyings) 1853 return; 1854 1855 /* If we're outputting 32 control data bits or less, then we can wait 1856 * until the shader is over to output them all. Otherwise we need to 1857 * output them as we go. Now is the time to do it, since we're about to 1858 * output the vertex_count'th vertex, so it's guaranteed that the 1859 * control data bits associated with the (vertex_count - 1)th vertex are 1860 * correct. 1861 */ 1862 if (gs_compile->control_data_header_size_bits > 32) { 1863 const fs_builder abld = 1864 bld.annotate("emit vertex: emit control data bits"); 1865 1866 /* Only emit control data bits if we've finished accumulating a batch 1867 * of 32 bits. This is the case when: 1868 * 1869 * (vertex_count * bits_per_vertex) % 32 == 0 1870 * 1871 * (in other words, when the last 5 bits of vertex_count * 1872 * bits_per_vertex are 0).
Assuming bits_per_vertex == 2^n for some 1873 * integer n (which is always the case, since bits_per_vertex is 1874 * always 1 or 2), this is equivalent to requiring that the last 5-n 1875 * bits of vertex_count are 0: 1876 * 1877 * vertex_count & (2^(5-n) - 1) == 0 1878 * 1879 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is 1880 * equivalent to: 1881 * 1882 * vertex_count & (32 / bits_per_vertex - 1) == 0 1883 * 1884 * TODO: If vertex_count is an immediate, we could do some of this math 1885 * at compile time... 1886 */ 1887 fs_inst *inst = 1888 abld.AND(bld.null_reg_d(), vertex_count, 1889 brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u)); 1890 inst->conditional_mod = BRW_CONDITIONAL_Z; 1891 1892 abld.IF(BRW_PREDICATE_NORMAL); 1893 /* If vertex_count is 0, then no control data bits have been 1894 * accumulated yet, so we can skip emitting them. 1895 */ 1896 abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u), 1897 BRW_CONDITIONAL_NEQ); 1898 abld.IF(BRW_PREDICATE_NORMAL); 1899 emit_gs_control_data_bits(vertex_count); 1900 abld.emit(BRW_OPCODE_ENDIF); 1901 1902 /* Reset control_data_bits to 0 so we can start accumulating a new 1903 * batch. 1904 * 1905 * Note: in the case where vertex_count == 0, this neutralizes the 1906 * effect of any call to EndPrimitive() that the shader may have 1907 * made before outputting its first vertex. 1908 */ 1909 inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u)); 1910 inst->force_writemask_all = true; 1911 abld.emit(BRW_OPCODE_ENDIF); 1912 } 1913 1914 emit_urb_writes(vertex_count); 1915 1916 /* In stream mode we have to set control data bits for all vertices 1917 * unless we have disabled control data bits completely (which we do 1918 * do for GL_POINTS outputs that don't use streams). 1919 */ 1920 if (gs_compile->control_data_header_size_bits > 0 && 1921 gs_prog_data->control_data_format == 1922 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { 1923 set_gs_stream_control_data_bits(vertex_count, stream_id); 1924 } 1925} 1926 1927void 1928fs_visitor::emit_gs_input_load(const fs_reg &dst, 1929 const nir_src &vertex_src, 1930 unsigned base_offset, 1931 const nir_src &offset_src, 1932 unsigned num_components, 1933 unsigned first_component) 1934{ 1935 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1936 1937 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src); 1938 nir_const_value *offset_const = nir_src_as_const_value(offset_src); 1939 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; 1940 1941 /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y], 1942 * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. Only 1943 * gl_PointSize is available as a GS input, however, so it must be that. 1944 */ 1945 const bool is_point_size = (base_offset == 0); 1946 1947 /* TODO: figure out push input layout for invocations == 1 */ 1948 if (gs_prog_data->invocations == 1 && 1949 offset_const != NULL && vertex_const != NULL && 1950 4 * (base_offset + offset_const->u32[0]) < push_reg_count) { 1951 int imm_offset = (base_offset + offset_const->u32[0]) * 4 + 1952 vertex_const->u32[0] * push_reg_count; 1953 /* This input was pushed into registers. 
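 *
 * For example, with gs_prog_data->base.urb_read_length == 2 (so
 * push_reg_count == 16), component .x of input slot 1 of vertex 1 is
 * found at ATTR 1 * 4 + 1 * 16 == 20.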
*/ 1954 if (is_point_size) { 1955 /* gl_PointSize comes in .w */ 1956 bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type)); 1957 } else { 1958 for (unsigned i = 0; i < num_components; i++) { 1959 bld.MOV(offset(dst, bld, i), 1960 fs_reg(ATTR, imm_offset + i + first_component, dst.type)); 1961 } 1962 } 1963 return; 1964 } 1965 1966 /* Resort to the pull model. Ensure the VUE handles are provided. */ 1967 gs_prog_data->base.include_vue_handles = true; 1968 1969 unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; 1970 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1971 1972 if (gs_prog_data->invocations == 1) { 1973 if (vertex_const) { 1974 /* The vertex index is constant; just select the proper URB handle. */ 1975 icp_handle = 1976 retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0), 1977 BRW_REGISTER_TYPE_UD); 1978 } else { 1979 /* The vertex index is non-constant. We need to use indirect 1980 * addressing to fetch the proper URB handle. 1981 * 1982 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 1983 * indicating that channel <n> should read the handle from 1984 * DWord <n>. We convert that to bytes by multiplying by 4. 1985 * 1986 * Next, we convert the vertex index to bytes by multiplying 1987 * by 32 (shifting by 5), and add the two together. This is 1988 * the final indirect byte offset. 1989 */ 1990 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1); 1991 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1992 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1993 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1994 1995 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 1996 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 1997 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 1998 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 1999 /* Convert vertex_index to bytes (multiply by 32) */ 2000 bld.SHL(vertex_offset_bytes, 2001 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2002 brw_imm_ud(5u)); 2003 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2004 2005 /* Use first_icp_handle as the base offset. There is one register 2006 * of URB handles per vertex, so inform the register allocator that 2007 * we might read up to nir->info->gs.vertices_in registers. 2008 */ 2009 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2010 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2011 fs_reg(icp_offset_bytes), 2012 brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE)); 2013 } 2014 } else { 2015 assert(gs_prog_data->invocations > 1); 2016 2017 if (vertex_const) { 2018 assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5); 2019 bld.MOV(icp_handle, 2020 retype(brw_vec1_grf(first_icp_handle + 2021 vertex_const->i32[0] / 8, 2022 vertex_const->i32[0] % 8), 2023 BRW_REGISTER_TYPE_UD)); 2024 } else { 2025 /* The vertex index is non-constant. We need to use indirect 2026 * addressing to fetch the proper URB handle. 2027 * 2028 */ 2029 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2030 2031 /* Convert vertex_index to bytes (multiply by 4) */ 2032 bld.SHL(icp_offset_bytes, 2033 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2034 brw_imm_ud(2u)); 2035 2036 /* Use first_icp_handle as the base offset. There is one DWord 2037 * of URB handles per vertex, so inform the register allocator that 2038 * we might read up to ceil(nir->info->gs.vertices_in / 8) registers. 
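 *
 * For example, a GS reading triangles_adjacency input (6 vertices per
 * primitive) needs DIV_ROUND_UP(6, 8) == 1 register of packed handles
 * here, whereas the invocations == 1 layout above uses a full register
 * per vertex.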
2039 */ 2040 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2041 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2042 fs_reg(icp_offset_bytes), 2043 brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) * 2044 REG_SIZE)); 2045 } 2046 } 2047 2048 fs_inst *inst; 2049 2050 fs_reg tmp_dst = dst; 2051 fs_reg indirect_offset = get_nir_src(offset_src); 2052 unsigned num_iterations = 1; 2053 unsigned orig_num_components = num_components; 2054 2055 if (type_sz(dst.type) == 8) { 2056 if (num_components > 2) { 2057 num_iterations = 2; 2058 num_components = 2; 2059 } 2060 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2061 tmp_dst = tmp; 2062 first_component = first_component / 2; 2063 } 2064 2065 for (unsigned iter = 0; iter < num_iterations; iter++) { 2066 if (offset_const) { 2067 /* Constant indexing - use global offset. */ 2068 if (first_component != 0) { 2069 unsigned read_components = num_components + first_component; 2070 fs_reg tmp = bld.vgrf(dst.type, read_components); 2071 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2072 inst->size_written = read_components * 2073 tmp.component_size(inst->exec_size); 2074 for (unsigned i = 0; i < num_components; i++) { 2075 bld.MOV(offset(tmp_dst, bld, i), 2076 offset(tmp, bld, i + first_component)); 2077 } 2078 } else { 2079 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst, 2080 icp_handle); 2081 inst->size_written = num_components * 2082 tmp_dst.component_size(inst->exec_size); 2083 } 2084 inst->offset = base_offset + offset_const->u32[0]; 2085 inst->mlen = 1; 2086 } else { 2087 /* Indirect indexing - use per-slot offsets as well. */ 2088 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2089 unsigned read_components = num_components + first_component; 2090 fs_reg tmp = bld.vgrf(dst.type, read_components); 2091 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2092 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2093 if (first_component != 0) { 2094 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2095 payload); 2096 inst->size_written = read_components * 2097 tmp.component_size(inst->exec_size); 2098 for (unsigned i = 0; i < num_components; i++) { 2099 bld.MOV(offset(tmp_dst, bld, i), 2100 offset(tmp, bld, i + first_component)); 2101 } 2102 } else { 2103 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst, 2104 payload); 2105 inst->size_written = num_components * 2106 tmp_dst.component_size(inst->exec_size); 2107 } 2108 inst->offset = base_offset; 2109 inst->mlen = 2; 2110 } 2111 2112 if (type_sz(dst.type) == 8) { 2113 shuffle_32bit_load_result_to_64bit_data( 2114 bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components); 2115 2116 for (unsigned c = 0; c < num_components; c++) 2117 bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c)); 2118 } 2119 2120 if (num_iterations > 1) { 2121 num_components = orig_num_components - 2; 2122 if(offset_const) { 2123 base_offset++; 2124 } else { 2125 fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2126 bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u)); 2127 indirect_offset = new_indirect; 2128 } 2129 } 2130 } 2131 2132 if (is_point_size) { 2133 /* Read the whole VUE header (because of alignment) and read .w. 
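 *
 * That is, redirect the last URB read into a four-component temporary
 * and MOV component 3 into the real destination, since gl_PointSize
 * lives in the .w channel of the VUE header.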
*/ 2134 fs_reg tmp = bld.vgrf(dst.type, 4); 2135 inst->dst = tmp; 2136 inst->size_written = 4 * REG_SIZE; 2137 bld.MOV(dst, offset(tmp, bld, 3)); 2138 } 2139} 2140 2141fs_reg 2142fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) 2143{ 2144 nir_src *offset_src = nir_get_io_offset_src(instr); 2145 nir_const_value *const_value = nir_src_as_const_value(*offset_src); 2146 2147 if (const_value) { 2148 /* The only constant offset we should find is 0. brw_nir.c's 2149 * add_const_offset_to_base() will fold other constant offsets 2150 * into instr->const_index[0]. 2151 */ 2152 assert(const_value->u32[0] == 0); 2153 return fs_reg(); 2154 } 2155 2156 return get_nir_src(*offset_src); 2157} 2158 2159static void 2160do_untyped_vector_read(const fs_builder &bld, 2161 const fs_reg dest, 2162 const fs_reg surf_index, 2163 const fs_reg offset_reg, 2164 unsigned num_components) 2165{ 2166 if (type_sz(dest.type) == 4) { 2167 fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, 2168 1 /* dims */, 2169 num_components, 2170 BRW_PREDICATE_NONE); 2171 read_result.type = dest.type; 2172 for (unsigned i = 0; i < num_components; i++) 2173 bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); 2174 } else if (type_sz(dest.type) == 8) { 2175 /* Reading a dvec, so we need to: 2176 * 2177 * 1. Multiply num_components by 2, to account for the fact that we 2178 * need to read 64-bit components. 2179 * 2. Shuffle the result of the load to form valid 64-bit elements 2180 * 3. Emit a second load (for components z/w) if needed. 2181 */ 2182 fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); 2183 bld.MOV(read_offset, offset_reg); 2184 2185 int iters = num_components <= 2 ? 1 : 2; 2186 2187 /* Load the dvec, the first iteration loads components x/y, the second 2188 * iteration, if needed, loads components z/w 2189 */ 2190 for (int it = 0; it < iters; it++) { 2191 /* Compute number of components to read in this iteration */ 2192 int iter_components = MIN2(2, num_components); 2193 num_components -= iter_components; 2194 2195 /* Read. Since this message reads 32-bit components, we need to 2196 * read twice as many components. 
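 *
 * For example, a dvec3 load takes two iterations: the first reads four
 * 32-bit components (the x/y doubles) and the second reads two (z).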
2197 */ 2198 fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset, 2199 1 /* dims */, 2200 iter_components * 2, 2201 BRW_PREDICATE_NONE); 2202 2203 /* Shuffle the 32-bit load result into valid 64-bit data */ 2204 const fs_reg packed_result = bld.vgrf(dest.type, iter_components); 2205 shuffle_32bit_load_result_to_64bit_data( 2206 bld, packed_result, read_result, iter_components); 2207 2208 /* Move each component to its destination */ 2209 read_result = retype(read_result, BRW_REGISTER_TYPE_DF); 2210 for (int c = 0; c < iter_components; c++) { 2211 bld.MOV(offset(dest, bld, it * 2 + c), 2212 offset(packed_result, bld, c)); 2213 } 2214 2215 bld.ADD(read_offset, read_offset, brw_imm_ud(16)); 2216 } 2217 } else { 2218 unreachable("Unsupported type"); 2219 } 2220} 2221 2222void 2223fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, 2224 nir_intrinsic_instr *instr) 2225{ 2226 assert(stage == MESA_SHADER_VERTEX); 2227 2228 fs_reg dest; 2229 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2230 dest = get_nir_dest(instr->dest); 2231 2232 switch (instr->intrinsic) { 2233 case nir_intrinsic_load_vertex_id: 2234 unreachable("should be lowered by lower_vertex_id()"); 2235 2236 case nir_intrinsic_load_vertex_id_zero_base: 2237 case nir_intrinsic_load_base_vertex: 2238 case nir_intrinsic_load_instance_id: 2239 case nir_intrinsic_load_base_instance: 2240 case nir_intrinsic_load_draw_id: { 2241 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 2242 fs_reg val = nir_system_values[sv]; 2243 assert(val.file != BAD_FILE); 2244 dest.type = val.type; 2245 bld.MOV(dest, val); 2246 break; 2247 } 2248 2249 case nir_intrinsic_load_input: { 2250 fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type); 2251 unsigned first_component = nir_intrinsic_component(instr); 2252 unsigned num_components = instr->num_components; 2253 enum brw_reg_type type = dest.type; 2254 2255 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 2256 assert(const_offset && "Indirect input loads not allowed"); 2257 src = offset(src, bld, const_offset->u32[0]); 2258 2259 for (unsigned j = 0; j < num_components; j++) { 2260 bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component)); 2261 } 2262 2263 if (type == BRW_REGISTER_TYPE_DF) { 2264 /* Once the double vector is read, set again its original register 2265 * type to continue with normal execution. 
2266 */ 2267 src = retype(src, type); 2268 dest = retype(dest, type); 2269 } 2270 2271 if (type_sz(src.type) == 8) { 2272 shuffle_32bit_load_result_to_64bit_data(bld, 2273 dest, 2274 retype(dest, BRW_REGISTER_TYPE_F), 2275 instr->num_components); 2276 } 2277 break; 2278 } 2279 2280 default: 2281 nir_emit_intrinsic(bld, instr); 2282 break; 2283 } 2284} 2285 2286void 2287fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, 2288 nir_intrinsic_instr *instr) 2289{ 2290 assert(stage == MESA_SHADER_TESS_CTRL); 2291 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2292 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2293 2294 fs_reg dst; 2295 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2296 dst = get_nir_dest(instr->dest); 2297 2298 switch (instr->intrinsic) { 2299 case nir_intrinsic_load_primitive_id: 2300 bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1))); 2301 break; 2302 case nir_intrinsic_load_invocation_id: 2303 bld.MOV(retype(dst, invocation_id.type), invocation_id); 2304 break; 2305 case nir_intrinsic_load_patch_vertices_in: 2306 bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), 2307 brw_imm_d(tcs_key->input_vertices)); 2308 break; 2309 2310 case nir_intrinsic_barrier: { 2311 if (tcs_prog_data->instances == 1) 2312 break; 2313 2314 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2315 fs_reg m0_2 = component(m0, 2); 2316 2317 const fs_builder chanbld = bld.exec_all().group(1, 0); 2318 2319 /* Zero the message header */ 2320 bld.exec_all().MOV(m0, brw_imm_ud(0u)); 2321 2322 /* Copy "Barrier ID" from r0.2, bits 16:13 */ 2323 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2324 brw_imm_ud(INTEL_MASK(16, 13))); 2325 2326 /* Shift it up to bits 27:24. */ 2327 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); 2328 2329 /* Set the Barrier Count and the enable bit */ 2330 chanbld.OR(m0_2, m0_2, 2331 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); 2332 2333 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); 2334 break; 2335 } 2336 2337 case nir_intrinsic_load_input: 2338 unreachable("nir_lower_io should never give us these."); 2339 break; 2340 2341 case nir_intrinsic_load_per_vertex_input: { 2342 fs_reg indirect_offset = get_indirect_offset(instr); 2343 unsigned imm_offset = instr->const_index[0]; 2344 2345 const nir_src &vertex_src = instr->src[0]; 2346 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src); 2347 2348 fs_inst *inst; 2349 2350 fs_reg icp_handle; 2351 2352 if (vertex_const) { 2353 /* Emit a MOV to resolve <0,1,0> regioning. */ 2354 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2355 bld.MOV(icp_handle, 2356 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3), 2357 vertex_const->i32[0] & 7), 2358 BRW_REGISTER_TYPE_UD)); 2359 } else if (tcs_prog_data->instances == 1 && 2360 vertex_src.is_ssa && 2361 vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic && 2362 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) { 2363 /* For the common case of only 1 instance, an array index of 2364 * gl_InvocationID means reading g1. Skip all the indirect work. 2365 */ 2366 icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2367 } else { 2368 /* The vertex index is non-constant. We need to use indirect 2369 * addressing to fetch the proper URB handle. 
2370 */ 2371 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2372 2373 /* Each ICP handle is a single DWord (4 bytes) */ 2374 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2375 bld.SHL(vertex_offset_bytes, 2376 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2377 brw_imm_ud(2u)); 2378 2379 /* Start at g1. We might read up to 4 registers. */ 2380 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2381 retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, 2382 brw_imm_ud(4 * REG_SIZE)); 2383 } 2384 2385 /* We can only read two double components with each URB read, so 2386 * we send two read messages in that case, each one loading up to 2387 * two double components. 2388 */ 2389 unsigned num_iterations = 1; 2390 unsigned num_components = instr->num_components; 2391 unsigned first_component = nir_intrinsic_component(instr); 2392 fs_reg orig_dst = dst; 2393 if (type_sz(dst.type) == 8) { 2394 first_component = first_component / 2; 2395 if (instr->num_components > 2) { 2396 num_iterations = 2; 2397 num_components = 2; 2398 } 2399 2400 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2401 dst = tmp; 2402 } 2403 2404 for (unsigned iter = 0; iter < num_iterations; iter++) { 2405 if (indirect_offset.file == BAD_FILE) { 2406 /* Constant indexing - use global offset. */ 2407 if (first_component != 0) { 2408 unsigned read_components = num_components + first_component; 2409 fs_reg tmp = bld.vgrf(dst.type, read_components); 2410 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2411 for (unsigned i = 0; i < num_components; i++) { 2412 bld.MOV(offset(dst, bld, i), 2413 offset(tmp, bld, i + first_component)); 2414 } 2415 } else { 2416 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); 2417 } 2418 inst->offset = imm_offset; 2419 inst->mlen = 1; 2420 } else { 2421 /* Indirect indexing - use per-slot offsets as well. */ 2422 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2423 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2424 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2425 if (first_component != 0) { 2426 unsigned read_components = num_components + first_component; 2427 fs_reg tmp = bld.vgrf(dst.type, read_components); 2428 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2429 payload); 2430 for (unsigned i = 0; i < num_components; i++) { 2431 bld.MOV(offset(dst, bld, i), 2432 offset(tmp, bld, i + first_component)); 2433 } 2434 } else { 2435 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2436 payload); 2437 } 2438 inst->offset = imm_offset; 2439 inst->mlen = 2; 2440 } 2441 inst->size_written = (num_components + first_component) * 2442 inst->dst.component_size(inst->exec_size); 2443 2444 /* If we are reading 64-bit data using 32-bit read messages we need 2445 * build proper 64-bit data elements by shuffling the low and high 2446 * 32-bit components around like we do for other things like UBOs 2447 * or SSBOs. 2448 */ 2449 if (type_sz(dst.type) == 8) { 2450 shuffle_32bit_load_result_to_64bit_data( 2451 bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components); 2452 2453 for (unsigned c = 0; c < num_components; c++) { 2454 bld.MOV(offset(orig_dst, bld, iter * 2 + c), 2455 offset(dst, bld, c)); 2456 } 2457 } 2458 2459 /* Copy the temporary to the destination to deal with writemasking. 2460 * 2461 * Also attempt to deal with gl_PointSize being in the .w component. 
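 *
 * A constant offset of 0 addresses the VUE header, where gl_PointSize is
 * stored in the .w component, so the read below is widened to a full
 * vec4 and component 3 is copied into the destination.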
2462 */ 2463 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { 2464 assert(type_sz(dst.type) < 8); 2465 inst->dst = bld.vgrf(dst.type, 4); 2466 inst->size_written = 4 * REG_SIZE; 2467 bld.MOV(dst, offset(inst->dst, bld, 3)); 2468 } 2469 2470 /* If we are loading double data and we need a second read message 2471 * adjust the write offset 2472 */ 2473 if (num_iterations > 1) { 2474 num_components = instr->num_components - 2; 2475 imm_offset++; 2476 } 2477 } 2478 break; 2479 } 2480 2481 case nir_intrinsic_load_output: 2482 case nir_intrinsic_load_per_vertex_output: { 2483 fs_reg indirect_offset = get_indirect_offset(instr); 2484 unsigned imm_offset = instr->const_index[0]; 2485 unsigned first_component = nir_intrinsic_component(instr); 2486 2487 fs_inst *inst; 2488 if (indirect_offset.file == BAD_FILE) { 2489 /* Replicate the patch handle to all enabled channels */ 2490 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2491 bld.MOV(patch_handle, 2492 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2493 2494 { 2495 if (first_component != 0) { 2496 unsigned read_components = 2497 instr->num_components + first_component; 2498 fs_reg tmp = bld.vgrf(dst.type, read_components); 2499 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 2500 patch_handle); 2501 inst->size_written = read_components * REG_SIZE; 2502 for (unsigned i = 0; i < instr->num_components; i++) { 2503 bld.MOV(offset(dst, bld, i), 2504 offset(tmp, bld, i + first_component)); 2505 } 2506 } else { 2507 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, 2508 patch_handle); 2509 inst->size_written = instr->num_components * REG_SIZE; 2510 } 2511 inst->offset = imm_offset; 2512 inst->mlen = 1; 2513 } 2514 } else { 2515 /* Indirect indexing - use per-slot offsets as well. */ 2516 const fs_reg srcs[] = { 2517 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2518 indirect_offset 2519 }; 2520 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2521 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2522 if (first_component != 0) { 2523 unsigned read_components = 2524 instr->num_components + first_component; 2525 fs_reg tmp = bld.vgrf(dst.type, read_components); 2526 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2527 payload); 2528 inst->size_written = read_components * REG_SIZE; 2529 for (unsigned i = 0; i < instr->num_components; i++) { 2530 bld.MOV(offset(dst, bld, i), 2531 offset(tmp, bld, i + first_component)); 2532 } 2533 } else { 2534 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2535 payload); 2536 inst->size_written = instr->num_components * REG_SIZE; 2537 } 2538 inst->offset = imm_offset; 2539 inst->mlen = 2; 2540 } 2541 break; 2542 } 2543 2544 case nir_intrinsic_store_output: 2545 case nir_intrinsic_store_per_vertex_output: { 2546 fs_reg value = get_nir_src(instr->src[0]); 2547 bool is_64bit = (instr->src[0].is_ssa ? 
2548 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64; 2549 fs_reg indirect_offset = get_indirect_offset(instr); 2550 unsigned imm_offset = instr->const_index[0]; 2551 unsigned swiz = BRW_SWIZZLE_XYZW; 2552 unsigned mask = instr->const_index[1]; 2553 unsigned header_regs = 0; 2554 fs_reg srcs[7]; 2555 srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 2556 2557 if (indirect_offset.file != BAD_FILE) { 2558 srcs[header_regs++] = indirect_offset; 2559 } 2560 2561 if (mask == 0) 2562 break; 2563 2564 unsigned num_components = util_last_bit(mask); 2565 enum opcode opcode; 2566 2567 /* We can only pack two 64-bit components in a single message, so send 2568 * 2 messages if we have more components 2569 */ 2570 unsigned num_iterations = 1; 2571 unsigned iter_components = num_components; 2572 unsigned first_component = nir_intrinsic_component(instr); 2573 if (is_64bit) { 2574 first_component = first_component / 2; 2575 if (instr->num_components > 2) { 2576 num_iterations = 2; 2577 iter_components = 2; 2578 } 2579 } 2580 2581 /* 64-bit data needs to be shuffled before we can write it to the URB. 2582 * We will use this temporary to shuffle the components in each 2583 * iteration. 2584 */ 2585 fs_reg tmp = 2586 fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type); 2587 2588 mask = mask << first_component; 2589 2590 for (unsigned iter = 0; iter < num_iterations; iter++) { 2591 if (!is_64bit && mask != WRITEMASK_XYZW) { 2592 srcs[header_regs++] = brw_imm_ud(mask << 16); 2593 opcode = indirect_offset.file != BAD_FILE ? 2594 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2595 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2596 } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) { 2597 /* Expand the 64-bit mask to 32-bit channels. We only handle 2598 * two channels in each iteration, so we only care about X/Y. 2599 */ 2600 unsigned mask32 = 0; 2601 if (mask & WRITEMASK_X) 2602 mask32 |= WRITEMASK_XY; 2603 if (mask & WRITEMASK_Y) 2604 mask32 |= WRITEMASK_ZW; 2605 2606 /* If the mask does not include any of the channels X or Y there 2607 * is nothing to do in this iteration. Move on to the next couple 2608 * of 64-bit channels. 2609 */ 2610 if (!mask32) { 2611 mask >>= 2; 2612 imm_offset++; 2613 continue; 2614 } 2615 2616 srcs[header_regs++] = brw_imm_ud(mask32 << 16); 2617 opcode = indirect_offset.file != BAD_FILE ? 2618 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2619 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2620 } else { 2621 opcode = indirect_offset.file != BAD_FILE ? 2622 SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : 2623 SHADER_OPCODE_URB_WRITE_SIMD8; 2624 } 2625 2626 for (unsigned i = 0; i < iter_components; i++) { 2627 if (!(mask & (1 << (i + first_component)))) 2628 continue; 2629 2630 if (!is_64bit) { 2631 srcs[header_regs + i + first_component] = 2632 offset(value, bld, BRW_GET_SWZ(swiz, i)); 2633 } else { 2634 /* We need to shuffle the 64-bit data to match the layout 2635 * expected by our 32-bit URB write messages. We use a temporary 2636 * for that.
2637 */ 2638 unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i); 2639 shuffle_64bit_data_for_32bit_write(bld, 2640 retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F), 2641 retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF), 2642 1); 2643 2644 /* Now copy the data to the destination */ 2645 fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type); 2646 unsigned idx = 2 * i; 2647 bld.MOV(dest, offset(tmp, bld, idx)); 2648 bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1)); 2649 srcs[header_regs + idx + first_component * 2] = dest; 2650 srcs[header_regs + idx + 1 + first_component * 2] = 2651 offset(dest, bld, 1); 2652 } 2653 } 2654 2655 unsigned mlen = 2656 header_regs + (is_64bit ? 2 * iter_components : iter_components) + 2657 (is_64bit ? 2 * first_component : first_component); 2658 fs_reg payload = 2659 bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 2660 bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); 2661 2662 fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); 2663 inst->offset = imm_offset; 2664 inst->mlen = mlen; 2665 2666 /* If this is a 64-bit attribute, select the next two 64-bit channels 2667 * to be handled in the next iteration. 2668 */ 2669 if (is_64bit) { 2670 mask >>= 2; 2671 imm_offset++; 2672 } 2673 } 2674 break; 2675 } 2676 2677 default: 2678 nir_emit_intrinsic(bld, instr); 2679 break; 2680 } 2681} 2682 2683void 2684fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, 2685 nir_intrinsic_instr *instr) 2686{ 2687 assert(stage == MESA_SHADER_TESS_EVAL); 2688 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); 2689 2690 fs_reg dest; 2691 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2692 dest = get_nir_dest(instr->dest); 2693 2694 switch (instr->intrinsic) { 2695 case nir_intrinsic_load_primitive_id: 2696 bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); 2697 break; 2698 case nir_intrinsic_load_tess_coord: 2699 /* gl_TessCoord is part of the payload in g1-3 */ 2700 for (unsigned i = 0; i < 3; i++) { 2701 bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); 2702 } 2703 break; 2704 2705 case nir_intrinsic_load_input: 2706 case nir_intrinsic_load_per_vertex_input: { 2707 fs_reg indirect_offset = get_indirect_offset(instr); 2708 unsigned imm_offset = instr->const_index[0]; 2709 unsigned first_component = nir_intrinsic_component(instr); 2710 2711 if (type_sz(dest.type) == 8) { 2712 first_component = first_component / 2; 2713 } 2714 2715 fs_inst *inst; 2716 if (indirect_offset.file == BAD_FILE) { 2717 /* Arbitrarily only push up to 32 vec4 slots worth of data, 2718 * which is 16 registers (since each holds 2 vec4 slots). 
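 *
 * For example, vec4 slot 5 lives in the second half of push register
 * ATTR 2, i.e. components 4..7 for 32-bit data.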
2719 */ 2720 const unsigned max_push_slots = 32; 2721 if (imm_offset < max_push_slots) { 2722 fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); 2723 for (int i = 0; i < instr->num_components; i++) { 2724 unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) + 2725 i + first_component; 2726 bld.MOV(offset(dest, bld, i), component(src, comp)); 2727 } 2728 tes_prog_data->base.urb_read_length = 2729 MAX2(tes_prog_data->base.urb_read_length, 2730 DIV_ROUND_UP(imm_offset + 1, 2)); 2731 } else { 2732 /* Replicate the patch handle to all enabled channels */ 2733 const fs_reg srcs[] = { 2734 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) 2735 }; 2736 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2737 bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); 2738 2739 if (first_component != 0) { 2740 unsigned read_components = 2741 instr->num_components + first_component; 2742 fs_reg tmp = bld.vgrf(dest.type, read_components); 2743 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 2744 patch_handle); 2745 inst->size_written = read_components * REG_SIZE; 2746 for (unsigned i = 0; i < instr->num_components; i++) { 2747 bld.MOV(offset(dest, bld, i), 2748 offset(tmp, bld, i + first_component)); 2749 } 2750 } else { 2751 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, 2752 patch_handle); 2753 inst->size_written = instr->num_components * REG_SIZE; 2754 } 2755 inst->mlen = 1; 2756 inst->offset = imm_offset; 2757 } 2758 } else { 2759 /* Indirect indexing - use per-slot offsets as well. */ 2760 2761 /* We can only read two double components with each URB read, so 2762 * we send two read messages in that case, each one loading up to 2763 * two double components. 2764 */ 2765 unsigned num_iterations = 1; 2766 unsigned num_components = instr->num_components; 2767 fs_reg orig_dest = dest; 2768 if (type_sz(dest.type) == 8) { 2769 if (instr->num_components > 2) { 2770 num_iterations = 2; 2771 num_components = 2; 2772 } 2773 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type); 2774 dest = tmp; 2775 } 2776 2777 for (unsigned iter = 0; iter < num_iterations; iter++) { 2778 const fs_reg srcs[] = { 2779 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2780 indirect_offset 2781 }; 2782 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2783 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2784 2785 if (first_component != 0) { 2786 unsigned read_components = 2787 num_components + first_component; 2788 fs_reg tmp = bld.vgrf(dest.type, read_components); 2789 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2790 payload); 2791 for (unsigned i = 0; i < num_components; i++) { 2792 bld.MOV(offset(dest, bld, i), 2793 offset(tmp, bld, i + first_component)); 2794 } 2795 } else { 2796 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, 2797 payload); 2798 } 2799 inst->mlen = 2; 2800 inst->offset = imm_offset; 2801 inst->size_written = (num_components + first_component) * 2802 inst->dst.component_size(inst->exec_size); 2803 2804 /* If we are reading 64-bit data using 32-bit read messages we need 2805 * build proper 64-bit data elements by shuffling the low and high 2806 * 32-bit components around like we do for other things like UBOs 2807 * or SSBOs. 
2808 */ 2809 if (type_sz(dest.type) == 8) { 2810 shuffle_32bit_load_result_to_64bit_data( 2811 bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components); 2812 2813 for (unsigned c = 0; c < num_components; c++) { 2814 bld.MOV(offset(orig_dest, bld, iter * 2 + c), 2815 offset(dest, bld, c)); 2816 } 2817 } 2818 2819 /* If we are loading double data and we need a second read message 2820 * adjust the offset 2821 */ 2822 if (num_iterations > 1) { 2823 num_components = instr->num_components - 2; 2824 imm_offset++; 2825 } 2826 } 2827 } 2828 break; 2829 } 2830 default: 2831 nir_emit_intrinsic(bld, instr); 2832 break; 2833 } 2834} 2835 2836void 2837fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, 2838 nir_intrinsic_instr *instr) 2839{ 2840 assert(stage == MESA_SHADER_GEOMETRY); 2841 fs_reg indirect_offset; 2842 2843 fs_reg dest; 2844 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2845 dest = get_nir_dest(instr->dest); 2846 2847 switch (instr->intrinsic) { 2848 case nir_intrinsic_load_primitive_id: 2849 assert(stage == MESA_SHADER_GEOMETRY); 2850 assert(brw_gs_prog_data(prog_data)->include_primitive_id); 2851 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 2852 retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); 2853 break; 2854 2855 case nir_intrinsic_load_input: 2856 unreachable("load_input intrinsics are invalid for the GS stage"); 2857 2858 case nir_intrinsic_load_per_vertex_input: 2859 emit_gs_input_load(dest, instr->src[0], instr->const_index[0], 2860 instr->src[1], instr->num_components, 2861 nir_intrinsic_component(instr)); 2862 break; 2863 2864 case nir_intrinsic_emit_vertex_with_counter: 2865 emit_gs_vertex(instr->src[0], instr->const_index[0]); 2866 break; 2867 2868 case nir_intrinsic_end_primitive_with_counter: 2869 emit_gs_end_primitive(instr->src[0]); 2870 break; 2871 2872 case nir_intrinsic_set_vertex_count: 2873 bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); 2874 break; 2875 2876 case nir_intrinsic_load_invocation_id: { 2877 fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 2878 assert(val.file != BAD_FILE); 2879 dest.type = val.type; 2880 bld.MOV(dest, val); 2881 break; 2882 } 2883 2884 default: 2885 nir_emit_intrinsic(bld, instr); 2886 break; 2887 } 2888} 2889 2890/** 2891 * Fetch the current render target layer index. 2892 */ 2893static fs_reg 2894fetch_render_target_array_index(const fs_builder &bld) 2895{ 2896 if (bld.shader->devinfo->gen >= 6) { 2897 /* The render target array index is provided in the thread payload as 2898 * bits 26:16 of r0.0. 2899 */ 2900 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 2901 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), 2902 brw_imm_uw(0x7ff)); 2903 return idx; 2904 } else { 2905 /* Pre-SNB we only ever render into the first layer of the framebuffer 2906 * since layered rendering is not implemented. 2907 */ 2908 return brw_imm_ud(0); 2909 } 2910} 2911 2912/** 2913 * Fake non-coherent framebuffer read implemented using TXF to fetch from the 2914 * framebuffer at the current fragment coordinates and sample index. 
2915 */ 2916fs_inst * 2917fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, 2918 unsigned target) 2919{ 2920 const struct gen_device_info *devinfo = bld.shader->devinfo; 2921 2922 assert(bld.shader->stage == MESA_SHADER_FRAGMENT); 2923 const brw_wm_prog_key *wm_key = 2924 reinterpret_cast<const brw_wm_prog_key *>(key); 2925 assert(!wm_key->coherent_fb_fetch); 2926 const struct brw_wm_prog_data *wm_prog_data = 2927 brw_wm_prog_data(stage_prog_data); 2928 2929 /* Calculate the surface index relative to the start of the texture binding 2930 * table block, since that's what the texturing messages expect. 2931 */ 2932 const unsigned surface = target + 2933 wm_prog_data->binding_table.render_target_read_start - 2934 wm_prog_data->base.binding_table.texture_start; 2935 2936 brw_mark_surface_used( 2937 bld.shader->stage_prog_data, 2938 wm_prog_data->binding_table.render_target_read_start + target); 2939 2940 /* Calculate the fragment coordinates. */ 2941 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 2942 bld.MOV(offset(coords, bld, 0), pixel_x); 2943 bld.MOV(offset(coords, bld, 1), pixel_y); 2944 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); 2945 2946 /* Calculate the sample index and MCS payload when multisampling. Luckily 2947 * the MCS fetch message behaves deterministically for UMS surfaces, so it 2948 * shouldn't be necessary to recompile based on whether the framebuffer is 2949 * CMS or UMS. 2950 */ 2951 if (wm_key->multisample_fbo && 2952 nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 2953 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); 2954 2955 const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 2956 const fs_reg mcs = wm_key->multisample_fbo ? 2957 emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg(); 2958 2959 /* Use either a normal or a CMS texel fetch message depending on whether 2960 * the framebuffer is single or multisample. On SKL+ use the wide CMS 2961 * message just in case the framebuffer uses 16x multisampling, it should 2962 * be equivalent to the normal CMS fetch for lower multisampling modes. 2963 */ 2964 const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL : 2965 devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL : 2966 SHADER_OPCODE_TXF_CMS_LOGICAL; 2967 2968 /* Emit the instruction. */ 2969 const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(), 2970 sample, mcs, 2971 brw_imm_ud(surface), brw_imm_ud(0), 2972 fs_reg(), brw_imm_ud(3), brw_imm_ud(0) }; 2973 STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS); 2974 2975 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); 2976 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 2977 2978 return inst; 2979} 2980 2981/** 2982 * Actual coherent framebuffer read implemented using the native render target 2983 * read message. Requires SKL+. 
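 *
 * The nir_intrinsic_load_output handling below picks this path only when
 * the program key's coherent_fb_fetch flag is set; otherwise it falls
 * back to the TXF-based emit_non_coherent_fb_read() above.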
2984 */ 2985static fs_inst * 2986emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) 2987{ 2988 assert(bld.shader->devinfo->gen >= 9); 2989 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); 2990 inst->target = target; 2991 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 2992 2993 return inst; 2994} 2995 2996static fs_reg 2997alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) 2998{ 2999 if (n && regs[0].file != BAD_FILE) { 3000 return regs[0]; 3001 3002 } else { 3003 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); 3004 3005 for (unsigned i = 0; i < n; i++) 3006 regs[i] = tmp; 3007 3008 return tmp; 3009 } 3010} 3011 3012static fs_reg 3013alloc_frag_output(fs_visitor *v, unsigned location) 3014{ 3015 assert(v->stage == MESA_SHADER_FRAGMENT); 3016 const brw_wm_prog_key *const key = 3017 reinterpret_cast<const brw_wm_prog_key *>(v->key); 3018 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); 3019 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); 3020 3021 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) 3022 return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); 3023 3024 else if (l == FRAG_RESULT_COLOR) 3025 return alloc_temporary(v->bld, 4, v->outputs, 3026 MAX2(key->nr_color_regions, 1)); 3027 3028 else if (l == FRAG_RESULT_DEPTH) 3029 return alloc_temporary(v->bld, 1, &v->frag_depth, 1); 3030 3031 else if (l == FRAG_RESULT_STENCIL) 3032 return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); 3033 3034 else if (l == FRAG_RESULT_SAMPLE_MASK) 3035 return alloc_temporary(v->bld, 1, &v->sample_mask, 1); 3036 3037 else if (l >= FRAG_RESULT_DATA0 && 3038 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) 3039 return alloc_temporary(v->bld, 4, 3040 &v->outputs[l - FRAG_RESULT_DATA0], 1); 3041 3042 else 3043 unreachable("Invalid location"); 3044} 3045 3046void 3047fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, 3048 nir_intrinsic_instr *instr) 3049{ 3050 assert(stage == MESA_SHADER_FRAGMENT); 3051 3052 fs_reg dest; 3053 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3054 dest = get_nir_dest(instr->dest); 3055 3056 switch (instr->intrinsic) { 3057 case nir_intrinsic_load_front_face: 3058 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 3059 *emit_frontfacing_interpolation()); 3060 break; 3061 3062 case nir_intrinsic_load_sample_pos: { 3063 fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 3064 assert(sample_pos.file != BAD_FILE); 3065 dest.type = sample_pos.type; 3066 bld.MOV(dest, sample_pos); 3067 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); 3068 break; 3069 } 3070 3071 case nir_intrinsic_load_layer_id: 3072 dest.type = BRW_REGISTER_TYPE_UD; 3073 bld.MOV(dest, fetch_render_target_array_index(bld)); 3074 break; 3075 3076 case nir_intrinsic_load_helper_invocation: 3077 case nir_intrinsic_load_sample_mask_in: 3078 case nir_intrinsic_load_sample_id: { 3079 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3080 fs_reg val = nir_system_values[sv]; 3081 assert(val.file != BAD_FILE); 3082 dest.type = val.type; 3083 bld.MOV(dest, val); 3084 break; 3085 } 3086 3087 case nir_intrinsic_store_output: { 3088 const fs_reg src = get_nir_src(instr->src[0]); 3089 const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); 3090 assert(const_offset && "Indirect output stores not allowed"); 3091 const unsigned location = nir_intrinsic_base(instr) + 3092 SET_FIELD(const_offset->u32[0], 
BRW_NIR_FRAG_OUTPUT_LOCATION); 3093 const fs_reg new_dest = retype(alloc_frag_output(this, location), 3094 src.type); 3095 3096 for (unsigned j = 0; j < instr->num_components; j++) 3097 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), 3098 offset(src, bld, j)); 3099 3100 break; 3101 } 3102 3103 case nir_intrinsic_load_output: { 3104 const unsigned l = GET_FIELD(nir_intrinsic_base(instr), 3105 BRW_NIR_FRAG_OUTPUT_LOCATION); 3106 assert(l >= FRAG_RESULT_DATA0); 3107 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3108 assert(const_offset && "Indirect output loads not allowed"); 3109 const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0]; 3110 const fs_reg tmp = bld.vgrf(dest.type, 4); 3111 3112 if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) 3113 emit_coherent_fb_read(bld, tmp, target); 3114 else 3115 emit_non_coherent_fb_read(bld, tmp, target); 3116 3117 for (unsigned j = 0; j < instr->num_components; j++) { 3118 bld.MOV(offset(dest, bld, j), 3119 offset(tmp, bld, nir_intrinsic_component(instr) + j)); 3120 } 3121 3122 break; 3123 } 3124 3125 case nir_intrinsic_discard: 3126 case nir_intrinsic_discard_if: { 3127 /* We track our discarded pixels in f0.1. By predicating on it, we can 3128 * update just the flag bits that aren't yet discarded. If there's no 3129 * condition, we emit a CMP of g0 != g0, so all currently executing 3130 * channels will get turned off. 3131 */ 3132 fs_inst *cmp; 3133 if (instr->intrinsic == nir_intrinsic_discard_if) { 3134 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), 3135 brw_imm_d(0), BRW_CONDITIONAL_Z); 3136 } else { 3137 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 3138 BRW_REGISTER_TYPE_UW)); 3139 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); 3140 } 3141 cmp->predicate = BRW_PREDICATE_NORMAL; 3142 cmp->flag_subreg = 1; 3143 3144 if (devinfo->gen >= 6) { 3145 emit_discard_jump(); 3146 } 3147 break; 3148 } 3149 3150 case nir_intrinsic_load_input: { 3151 /* load_input is only used for flat inputs */ 3152 unsigned base = nir_intrinsic_base(instr); 3153 unsigned component = nir_intrinsic_component(instr); 3154 unsigned num_components = instr->num_components; 3155 enum brw_reg_type type = dest.type; 3156 3157 /* Special case fields in the VUE header */ 3158 if (base == VARYING_SLOT_LAYER) 3159 component = 1; 3160 else if (base == VARYING_SLOT_VIEWPORT) 3161 component = 2; 3162 3163 if (nir_dest_bit_size(instr->dest) == 64) { 3164 /* const_index is in 32-bit type size units that could not be aligned 3165 * with DF. We need to read the double vector as if it was a float 3166 * vector of twice the number of components to fetch the right data. 3167 */ 3168 type = BRW_REGISTER_TYPE_F; 3169 num_components *= 2; 3170 } 3171 3172 for (unsigned int i = 0; i < num_components; i++) { 3173 struct brw_reg interp = interp_reg(base, component + i); 3174 interp = suboffset(interp, 3); 3175 bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i), 3176 retype(fs_reg(interp), type)); 3177 } 3178 3179 if (nir_dest_bit_size(instr->dest) == 64) { 3180 shuffle_32bit_load_result_to_64bit_data(bld, 3181 dest, 3182 retype(dest, type), 3183 instr->num_components); 3184 } 3185 break; 3186 } 3187 3188 case nir_intrinsic_load_barycentric_pixel: 3189 case nir_intrinsic_load_barycentric_centroid: 3190 case nir_intrinsic_load_barycentric_sample: 3191 /* Do nothing - load_interpolated_input handling will handle it later. 
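 *
 * The barycentric coordinates for these modes already live in the thread
 * payload (this->delta_xy), so nothing needs to be emitted here;
 * load_interpolated_input below picks the right set via
 * brw_barycentric_mode().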
*/ 3192 break; 3193 3194 case nir_intrinsic_load_barycentric_at_sample: { 3195 const glsl_interp_mode interpolation = 3196 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3197 3198 nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); 3199 3200 if (const_sample) { 3201 unsigned msg_data = const_sample->i32[0] << 4; 3202 3203 emit_pixel_interpolater_send(bld, 3204 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3205 dest, 3206 fs_reg(), /* src */ 3207 brw_imm_ud(msg_data), 3208 interpolation); 3209 } else { 3210 const fs_reg sample_src = retype(get_nir_src(instr->src[0]), 3211 BRW_REGISTER_TYPE_UD); 3212 3213 if (nir_src_is_dynamically_uniform(instr->src[0])) { 3214 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3215 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3216 bld.exec_all().group(1, 0) 3217 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3218 emit_pixel_interpolater_send(bld, 3219 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3220 dest, 3221 fs_reg(), /* src */ 3222 msg_data, 3223 interpolation); 3224 } else { 3225 /* Make a loop that sends a message to the pixel interpolater 3226 * for the sample number in each live channel. If there are 3227 * multiple channels with the same sample number then these 3228 * will be handled simultaneously with a single interation of 3229 * the loop. 3230 */ 3231 bld.emit(BRW_OPCODE_DO); 3232 3233 /* Get the next live sample number into sample_id_reg */ 3234 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3235 3236 /* Set the flag register so that we can perform the send 3237 * message on all channels that have the same sample number 3238 */ 3239 bld.CMP(bld.null_reg_ud(), 3240 sample_src, sample_id, 3241 BRW_CONDITIONAL_EQ); 3242 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3243 bld.exec_all().group(1, 0) 3244 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3245 fs_inst *inst = 3246 emit_pixel_interpolater_send(bld, 3247 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3248 dest, 3249 fs_reg(), /* src */ 3250 msg_data, 3251 interpolation); 3252 set_predicate(BRW_PREDICATE_NORMAL, inst); 3253 3254 /* Continue the loop if there are any live channels left */ 3255 set_predicate_inv(BRW_PREDICATE_NORMAL, 3256 true, /* inverse */ 3257 bld.emit(BRW_OPCODE_WHILE)); 3258 } 3259 } 3260 break; 3261 } 3262 3263 case nir_intrinsic_load_barycentric_at_offset: { 3264 const glsl_interp_mode interpolation = 3265 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3266 3267 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3268 3269 if (const_offset) { 3270 unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf; 3271 unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf; 3272 3273 emit_pixel_interpolater_send(bld, 3274 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, 3275 dest, 3276 fs_reg(), /* src */ 3277 brw_imm_ud(off_x | (off_y << 4)), 3278 interpolation); 3279 } else { 3280 fs_reg src = vgrf(glsl_type::ivec2_type); 3281 fs_reg offset_src = retype(get_nir_src(instr->src[0]), 3282 BRW_REGISTER_TYPE_F); 3283 for (int i = 0; i < 2; i++) { 3284 fs_reg temp = vgrf(glsl_type::float_type); 3285 bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f)); 3286 fs_reg itemp = vgrf(glsl_type::int_type); 3287 /* float to int */ 3288 bld.MOV(itemp, temp); 3289 3290 /* Clamp the upper end of the range to +7/16. 
3291 * ARB_gpu_shader5 requires that we support a maximum offset 3292 * of +0.5, which isn't representable in a S0.4 value -- if 3293 * we didn't clamp it, we'd end up with -8/16, which is the 3294 * opposite of what the shader author wanted. 3295 * 3296 * This is legal due to ARB_gpu_shader5's quantization 3297 * rules: 3298 * 3299 * "Not all values of <offset> may be supported; x and y 3300 * offsets may be rounded to fixed-point values with the 3301 * number of fraction bits given by the 3302 * implementation-dependent constant 3303 * FRAGMENT_INTERPOLATION_OFFSET_BITS" 3304 */ 3305 set_condmod(BRW_CONDITIONAL_L, 3306 bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7))); 3307 } 3308 3309 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; 3310 emit_pixel_interpolater_send(bld, 3311 opcode, 3312 dest, 3313 src, 3314 brw_imm_ud(0u), 3315 interpolation); 3316 } 3317 break; 3318 } 3319 3320 case nir_intrinsic_load_interpolated_input: { 3321 if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { 3322 emit_fragcoord_interpolation(dest); 3323 break; 3324 } 3325 3326 assert(instr->src[0].ssa && 3327 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); 3328 nir_intrinsic_instr *bary_intrinsic = 3329 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 3330 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; 3331 enum glsl_interp_mode interp_mode = 3332 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); 3333 fs_reg dst_xy; 3334 3335 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || 3336 bary_intrin == nir_intrinsic_load_barycentric_at_sample) { 3337 /* Use the result of the PI message */ 3338 dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); 3339 } else { 3340 /* Use the delta_xy values computed from the payload */ 3341 enum brw_barycentric_mode bary = 3342 brw_barycentric_mode(interp_mode, bary_intrin); 3343 3344 dst_xy = this->delta_xy[bary]; 3345 } 3346 3347 for (unsigned int i = 0; i < instr->num_components; i++) { 3348 fs_reg interp = 3349 fs_reg(interp_reg(nir_intrinsic_base(instr), 3350 nir_intrinsic_component(instr) + i)); 3351 interp.type = BRW_REGISTER_TYPE_F; 3352 dest.type = BRW_REGISTER_TYPE_F; 3353 3354 if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) { 3355 fs_reg tmp = vgrf(glsl_type::float_type); 3356 bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); 3357 bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); 3358 } else { 3359 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); 3360 } 3361 } 3362 break; 3363 } 3364 3365 default: 3366 nir_emit_intrinsic(bld, instr); 3367 break; 3368 } 3369} 3370 3371void 3372fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, 3373 nir_intrinsic_instr *instr) 3374{ 3375 assert(stage == MESA_SHADER_COMPUTE); 3376 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 3377 3378 fs_reg dest; 3379 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3380 dest = get_nir_dest(instr->dest); 3381 3382 switch (instr->intrinsic) { 3383 case nir_intrinsic_barrier: 3384 emit_barrier(); 3385 cs_prog_data->uses_barrier = true; 3386 break; 3387 3388 case nir_intrinsic_load_local_invocation_id: 3389 case nir_intrinsic_load_work_group_id: { 3390 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3391 fs_reg val = nir_system_values[sv]; 3392 assert(val.file != BAD_FILE); 3393 dest.type = val.type; 3394 for (unsigned i = 0; i < 3; i++) 3395 bld.MOV(offset(dest, bld, i), offset(val, bld, i)); 3396 break; 3397 } 3398 3399 case 
nir_intrinsic_load_num_work_groups: { 3400 const unsigned surface = 3401 cs_prog_data->binding_table.work_groups_start; 3402 3403 cs_prog_data->uses_num_work_groups = true; 3404 3405 fs_reg surf_index = brw_imm_ud(surface); 3406 brw_mark_surface_used(prog_data, surface); 3407 3408 /* Read the 3 GLuint components of gl_NumWorkGroups */ 3409 for (unsigned i = 0; i < 3; i++) { 3410 fs_reg read_result = 3411 emit_untyped_read(bld, surf_index, 3412 brw_imm_ud(i << 2), 3413 1 /* dims */, 1 /* size */, 3414 BRW_PREDICATE_NONE); 3415 read_result.type = dest.type; 3416 bld.MOV(dest, read_result); 3417 dest = offset(dest, bld, 1); 3418 } 3419 break; 3420 } 3421 3422 case nir_intrinsic_shared_atomic_add: 3423 nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr); 3424 break; 3425 case nir_intrinsic_shared_atomic_imin: 3426 nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr); 3427 break; 3428 case nir_intrinsic_shared_atomic_umin: 3429 nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr); 3430 break; 3431 case nir_intrinsic_shared_atomic_imax: 3432 nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr); 3433 break; 3434 case nir_intrinsic_shared_atomic_umax: 3435 nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr); 3436 break; 3437 case nir_intrinsic_shared_atomic_and: 3438 nir_emit_shared_atomic(bld, BRW_AOP_AND, instr); 3439 break; 3440 case nir_intrinsic_shared_atomic_or: 3441 nir_emit_shared_atomic(bld, BRW_AOP_OR, instr); 3442 break; 3443 case nir_intrinsic_shared_atomic_xor: 3444 nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr); 3445 break; 3446 case nir_intrinsic_shared_atomic_exchange: 3447 nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr); 3448 break; 3449 case nir_intrinsic_shared_atomic_comp_swap: 3450 nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); 3451 break; 3452 3453 case nir_intrinsic_load_shared: { 3454 assert(devinfo->gen >= 7); 3455 3456 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); 3457 3458 /* Get the offset to read from */ 3459 fs_reg offset_reg; 3460 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3461 if (const_offset) { 3462 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); 3463 } else { 3464 offset_reg = vgrf(glsl_type::uint_type); 3465 bld.ADD(offset_reg, 3466 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 3467 brw_imm_ud(instr->const_index[0])); 3468 } 3469 3470 /* Read the vector */ 3471 do_untyped_vector_read(bld, dest, surf_index, offset_reg, 3472 instr->num_components); 3473 break; 3474 } 3475 3476 case nir_intrinsic_store_shared: { 3477 assert(devinfo->gen >= 7); 3478 3479 /* Block index */ 3480 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); 3481 3482 /* Value */ 3483 fs_reg val_reg = get_nir_src(instr->src[0]); 3484 3485 /* Writemask */ 3486 unsigned writemask = instr->const_index[1]; 3487 3488 /* get_nir_src() retypes to integer. Be wary of 64-bit types though 3489 * since the untyped writes below operate in units of 32-bits, which 3490 * means that we need to write twice as many components each time. 3491 * Also, we have to suffle 64-bit data to be in the appropriate layout 3492 * expected by our 32-bit write messages. 3493 */ 3494 unsigned type_size = 4; 3495 unsigned bit_size = instr->src[0].is_ssa ? 
3496 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size; 3497 if (bit_size == 64) { 3498 type_size = 8; 3499 fs_reg tmp = 3500 fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type); 3501 shuffle_64bit_data_for_32bit_write( 3502 bld, 3503 retype(tmp, BRW_REGISTER_TYPE_F), 3504 retype(val_reg, BRW_REGISTER_TYPE_DF), 3505 instr->num_components); 3506 val_reg = tmp; 3507 } 3508 3509 unsigned type_slots = type_size / 4; 3510 3511 /* Combine groups of consecutive enabled channels in one write 3512 * message. We use ffs to find the first enabled channel and then ffs on 3513 * the bit-inverse, down-shifted writemask to determine the length of 3514 * the block of enabled bits. 3515 */ 3516 while (writemask) { 3517 unsigned first_component = ffs(writemask) - 1; 3518 unsigned length = ffs(~(writemask >> first_component)) - 1; 3519 3520 /* We can't write more than 2 64-bit components at once. Limit the 3521 * length of the write to what we can do and let the next iteration 3522 * handle the rest 3523 */ 3524 if (type_size > 4) 3525 length = MIN2(2, length); 3526 3527 fs_reg offset_reg; 3528 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); 3529 if (const_offset) { 3530 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] + 3531 type_size * first_component); 3532 } else { 3533 offset_reg = vgrf(glsl_type::uint_type); 3534 bld.ADD(offset_reg, 3535 retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD), 3536 brw_imm_ud(instr->const_index[0] + type_size * first_component)); 3537 } 3538 3539 emit_untyped_write(bld, surf_index, offset_reg, 3540 offset(val_reg, bld, first_component * type_slots), 3541 1 /* dims */, length * type_slots, 3542 BRW_PREDICATE_NONE); 3543 3544 /* Clear the bits in the writemask that we just wrote, then try 3545 * again to see if more channels are left. 3546 */ 3547 writemask &= (15 << (first_component + length)); 3548 } 3549 3550 break; 3551 } 3552 3553 default: 3554 nir_emit_intrinsic(bld, instr); 3555 break; 3556 } 3557} 3558 3559void 3560fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) 3561{ 3562 fs_reg dest; 3563 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3564 dest = get_nir_dest(instr->dest); 3565 3566 switch (instr->intrinsic) { 3567 case nir_intrinsic_atomic_counter_inc: 3568 case nir_intrinsic_atomic_counter_dec: 3569 case nir_intrinsic_atomic_counter_read: 3570 case nir_intrinsic_atomic_counter_add: 3571 case nir_intrinsic_atomic_counter_min: 3572 case nir_intrinsic_atomic_counter_max: 3573 case nir_intrinsic_atomic_counter_and: 3574 case nir_intrinsic_atomic_counter_or: 3575 case nir_intrinsic_atomic_counter_xor: 3576 case nir_intrinsic_atomic_counter_exchange: 3577 case nir_intrinsic_atomic_counter_comp_swap: { 3578 if (stage == MESA_SHADER_FRAGMENT && 3579 instr->intrinsic != nir_intrinsic_atomic_counter_read) 3580 brw_wm_prog_data(prog_data)->has_side_effects = true; 3581 3582 /* Get some metadata from the image intrinsic. */ 3583 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 3584 3585 /* Get the arguments of the atomic intrinsic. */ 3586 const fs_reg offset = get_nir_src(instr->src[0]); 3587 const unsigned surface = (stage_prog_data->binding_table.abo_start + 3588 instr->const_index[0]); 3589 const fs_reg src0 = (info->num_srcs >= 2 3590 ? get_nir_src(instr->src[1]) : fs_reg()); 3591 const fs_reg src1 = (info->num_srcs >= 3 3592 ? 
get_nir_src(instr->src[2]) : fs_reg()); 3593 fs_reg tmp; 3594 3595 assert(info->num_srcs <= 3); 3596 3597 /* Emit a surface read or atomic op. */ 3598 if (instr->intrinsic == nir_intrinsic_atomic_counter_read) { 3599 tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1); 3600 } else { 3601 tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0, 3602 src1, 1, 1, 3603 get_atomic_counter_op(instr->intrinsic)); 3604 } 3605 3606 /* Assign the result. */ 3607 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp); 3608 3609 /* Mark the surface as used. */ 3610 brw_mark_surface_used(stage_prog_data, surface); 3611 break; 3612 } 3613 3614 case nir_intrinsic_image_load: 3615 case nir_intrinsic_image_store: 3616 case nir_intrinsic_image_atomic_add: 3617 case nir_intrinsic_image_atomic_min: 3618 case nir_intrinsic_image_atomic_max: 3619 case nir_intrinsic_image_atomic_and: 3620 case nir_intrinsic_image_atomic_or: 3621 case nir_intrinsic_image_atomic_xor: 3622 case nir_intrinsic_image_atomic_exchange: 3623 case nir_intrinsic_image_atomic_comp_swap: { 3624 using namespace image_access; 3625 3626 if (stage == MESA_SHADER_FRAGMENT && 3627 instr->intrinsic != nir_intrinsic_image_load) 3628 brw_wm_prog_data(prog_data)->has_side_effects = true; 3629 3630 /* Get the referenced image variable and type. */ 3631 const nir_variable *var = instr->variables[0]->var; 3632 const glsl_type *type = var->type->without_array(); 3633 const brw_reg_type base_type = get_image_base_type(type); 3634 3635 /* Get some metadata from the image intrinsic. */ 3636 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 3637 const unsigned arr_dims = type->sampler_array ? 1 : 0; 3638 const unsigned surf_dims = type->coordinate_components() - arr_dims; 3639 const unsigned format = var->data.image.format; 3640 3641 /* Get the arguments of the image intrinsic. */ 3642 const fs_reg image = get_nir_image_deref(instr->variables[0]); 3643 const fs_reg addr = retype(get_nir_src(instr->src[0]), 3644 BRW_REGISTER_TYPE_UD); 3645 const fs_reg src0 = (info->num_srcs >= 3 ? 3646 retype(get_nir_src(instr->src[2]), base_type) : 3647 fs_reg()); 3648 const fs_reg src1 = (info->num_srcs >= 4 ? 3649 retype(get_nir_src(instr->src[3]), base_type) : 3650 fs_reg()); 3651 fs_reg tmp; 3652 3653 /* Emit an image load, store or atomic op. */ 3654 if (instr->intrinsic == nir_intrinsic_image_load) 3655 tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format); 3656 3657 else if (instr->intrinsic == nir_intrinsic_image_store) 3658 emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, 3659 var->data.image.write_only ? GL_NONE : format); 3660 3661 else 3662 tmp = emit_image_atomic(bld, image, addr, src0, src1, 3663 surf_dims, arr_dims, info->dest_components, 3664 get_image_atomic_op(instr->intrinsic, type)); 3665 3666 /* Assign the result. 
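       * The surface op leaves info->dest_components values in tmp; move
       * them into the NIR destination one component at a time, retyped to
       * the image base type.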
       */
      for (unsigned c = 0; c < info->dest_components; ++c)
         bld.MOV(offset(retype(dest, base_type), bld, c),
                 offset(tmp, bld, c));
      break;
   }

   case nir_intrinsic_memory_barrier_atomic_counter:
   case nir_intrinsic_memory_barrier_buffer:
   case nir_intrinsic_memory_barrier_image:
   case nir_intrinsic_memory_barrier: {
      const fs_builder ubld = bld.group(8, 0);
      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
         ->size_written = 2 * REG_SIZE;
      break;
   }

   case nir_intrinsic_group_memory_barrier:
   case nir_intrinsic_memory_barrier_shared:
      /* We treat these workgroup-level barriers as no-ops. This should be
       * safe at present and as long as:
       *
       *  - Memory access instructions are not subsequently reordered by the
       *    compiler back-end.
       *
       *  - All threads from a given compute shader workgroup fit within a
       *    single subslice and therefore talk to the same HDC shared unit,
       *    which supposedly guarantees ordering and coherency between
       *    threads from the same workgroup. This may change in the future
       *    when we start splitting workgroups across multiple subslices.
       *
       *  - The context is not in fault-and-stream mode, which could cause
       *    memory transactions (including to SLM) prior to the barrier to be
       *    replayed after the barrier if a pagefault occurs. This shouldn't
       *    be a problem up to and including SKL because fault-and-stream is
       *    not usable due to hardware issues, but that's likely to change in
       *    the future.
       */
      break;

   case nir_intrinsic_shader_clock: {
      /* We cannot do anything if there is an event, so ignore it for now */
      const fs_reg shader_clock = get_timestamp(bld);
      const fs_reg srcs[] = { component(shader_clock, 0),
                              component(shader_clock, 1) };
      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
      break;
   }

   case nir_intrinsic_image_size: {
      /* Get the referenced image variable and type. */
      const nir_variable *var = instr->variables[0]->var;
      const glsl_type *type = var->type->without_array();

      /* Get the size of the image. */
      const fs_reg image = get_nir_image_deref(instr->variables[0]);
      const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

      /* For 1DArray image types, the array index is stored in the Z component.
       * Fix this by swizzling the Z component to the Y component.
       */
      const bool is_1d_array_image =
         type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
         type->sampler_array;

      /* For CubeArray images, we should count the number of cubes instead
       * of the number of faces. Fix it by dividing the (Z component) by 6.
       */
      const bool is_cube_array_image =
         type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
         type->sampler_array;

      /* Copy all the components.
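       * Components past the image's coordinate count are returned as 1,
       * the 1DArray layer count is taken from the Z slot of the size
       * vector, and the CubeArray depth is divided by 6 so we report cubes
       * rather than faces.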
       */
      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
      for (unsigned c = 0; c < info->dest_components; ++c) {
         if ((int)c >= type->coordinate_components()) {
            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
                    brw_imm_d(1));
         } else if (c == 1 && is_1d_array_image) {
            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
                    offset(size, bld, 2));
         } else if (c == 2 && is_cube_array_image) {
            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
                     offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
                     offset(size, bld, c), brw_imm_d(6));
         } else {
            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
                    offset(size, bld, c));
         }
      }

      break;
   }

   case nir_intrinsic_image_samples:
      /* The driver does not support multi-sampled images. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
      break;

   case nir_intrinsic_load_uniform: {
      /* Offsets are in bytes but they should always be multiples of 4 */
      assert(instr->const_index[0] % 4 == 0);

      fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);

      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
      if (const_offset) {
         /* Offsets are in bytes but they should always be multiples of 4 */
         assert(const_offset->u32[0] % 4 == 0);
         src.offset = const_offset->u32[0];

         for (unsigned j = 0; j < instr->num_components; j++) {
            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
         }
      } else {
         fs_reg indirect = retype(get_nir_src(instr->src[0]),
                                  BRW_REGISTER_TYPE_UD);

         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
          * go past the end of the uniform. In order to keep the n'th
          * component from running past, we subtract off the size of all but
          * one component of the vector.
          */
         assert(instr->const_index[1] >=
                instr->num_components * (int) type_sz(dest.type));
         unsigned read_size = instr->const_index[1] -
            (instr->num_components - 1) * type_sz(dest.type);

         bool supports_64bit_indirects =
            !devinfo->is_cherryview && !devinfo->is_broxton;

         if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
            for (unsigned j = 0; j < instr->num_components; j++) {
               bld.emit(SHADER_OPCODE_MOV_INDIRECT,
                        offset(dest, bld, j), offset(src, bld, j),
                        indirect, brw_imm_ud(read_size));
            }
         } else {
            const unsigned num_mov_indirects =
               type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
            /* We read a little bit less per MOV INDIRECT, as they are now
             * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
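             * As with read_size above, we subtract the size of all but one
             * of the 32-bit sub-reads so the last MOV INDIRECT cannot run
             * past the end of the uniform.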
             */
            const unsigned read_size_32bit = read_size -
               (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
            for (unsigned j = 0; j < instr->num_components; j++) {
               for (unsigned i = 0; i < num_mov_indirects; i++) {
                  bld.emit(SHADER_OPCODE_MOV_INDIRECT,
                           subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
                           subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
                           indirect, brw_imm_ud(read_size_32bit));
               }
            }
         }
      }
      break;
   }

   case nir_intrinsic_load_ubo: {
      nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
      fs_reg surf_index;

      if (const_index) {
         const unsigned index = stage_prog_data->binding_table.ubo_start +
                                const_index->u32[0];
         surf_index = brw_imm_ud(index);
         brw_mark_surface_used(prog_data, index);
      } else {
         /* The block index is not a constant. Evaluate the index expression
          * per-channel and add the base UBO index; we have to select a value
          * from any live channel.
          */
         surf_index = vgrf(glsl_type::uint_type);
         bld.ADD(surf_index, get_nir_src(instr->src[0]),
                 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
         surf_index = bld.emit_uniformize(surf_index);

         /* Assume this may touch any UBO. It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(prog_data,
                               stage_prog_data->binding_table.ubo_start +
                               nir->info->num_ubos - 1);
      }

      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
      if (const_offset == NULL) {
         fs_reg base_offset = retype(get_nir_src(instr->src[1]),
                                     BRW_REGISTER_TYPE_UD);

         for (int i = 0; i < instr->num_components; i++)
            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
                                       base_offset, i * type_sz(dest.type));
      } else {
         /* Even if we are loading doubles, a pull constant load will load
          * a 32-bit vec4, so we should only reserve vgrf space for that.
          * If we need to load a full dvec4 we will have to emit 2 loads.
          * This is similar to demote_pull_constants(), except that in that
          * case we see individual accesses to each component of the vector
          * and then we let CSE deal with duplicate loads. Here we see a
          * vector access and we have to split it if necessary.
          */
         const unsigned type_size = type_sz(dest.type);
         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
         const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);

         for (unsigned c = 0; c < instr->num_components;) {
            const unsigned base = const_offset->u32[0] + c * type_size;
            /* Number of usable components in the next block-aligned load.
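             * As an illustrative example, a dvec4 load at byte offset 48
             * gets count = MIN2(4, (64 - 48) / 8) = 2 components from the
             * first cacheline and picks up the remaining two from the next
             * block-aligned load on the following iteration.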
             */
            const unsigned count = MIN2(instr->num_components - c,
                                        (block_sz - base % block_sz) / type_size);

            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                      packed_consts, surf_index,
                      brw_imm_ud(base & ~(block_sz - 1)));

            const fs_reg consts =
               retype(byte_offset(packed_consts, base & (block_sz - 1)),
                      dest.type);

            for (unsigned d = 0; d < count; d++)
               bld.MOV(offset(dest, bld, c + d), component(consts, d));

            c += count;
         }
      }
      break;
   }

   case nir_intrinsic_load_ssbo: {
      assert(devinfo->gen >= 7);

      nir_const_value *const_uniform_block =
         nir_src_as_const_value(instr->src[0]);

      fs_reg surf_index;
      if (const_uniform_block) {
         unsigned index = stage_prog_data->binding_table.ssbo_start +
                          const_uniform_block->u32[0];
         surf_index = brw_imm_ud(index);
         brw_mark_surface_used(prog_data, index);
      } else {
         surf_index = vgrf(glsl_type::uint_type);
         bld.ADD(surf_index, get_nir_src(instr->src[0]),
                 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));

         /* Assume this may touch any SSBO. It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(prog_data,
                               stage_prog_data->binding_table.ssbo_start +
                               nir->info->num_ssbos - 1);
      }

      fs_reg offset_reg;
      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
      if (const_offset) {
         offset_reg = brw_imm_ud(const_offset->u32[0]);
      } else {
         offset_reg = get_nir_src(instr->src[1]);
      }

      /* Read the vector */
      do_untyped_vector_read(bld, dest, surf_index, offset_reg,
                             instr->num_components);

      break;
   }

   case nir_intrinsic_store_ssbo: {
      assert(devinfo->gen >= 7);

      if (stage == MESA_SHADER_FRAGMENT)
         brw_wm_prog_data(prog_data)->has_side_effects = true;

      /* Block index */
      fs_reg surf_index;
      nir_const_value *const_uniform_block =
         nir_src_as_const_value(instr->src[1]);
      if (const_uniform_block) {
         unsigned index = stage_prog_data->binding_table.ssbo_start +
                          const_uniform_block->u32[0];
         surf_index = brw_imm_ud(index);
         brw_mark_surface_used(prog_data, index);
      } else {
         surf_index = vgrf(glsl_type::uint_type);
         bld.ADD(surf_index, get_nir_src(instr->src[1]),
                 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));

         brw_mark_surface_used(prog_data,
                               stage_prog_data->binding_table.ssbo_start +
                               nir->info->num_ssbos - 1);
      }

      /* Value */
      fs_reg val_reg = get_nir_src(instr->src[0]);

      /* Writemask */
      unsigned writemask = instr->const_index[0];

      /* get_nir_src() retypes to integer. Be wary of 64-bit types though
       * since the untyped writes below operate in units of 32-bits, which
       * means that we need to write twice as many components each time.
       * Also, we have to shuffle 64-bit data to be in the appropriate layout
       * expected by our 32-bit write messages.
       */
      unsigned type_size = 4;
      unsigned bit_size = instr->src[0].is_ssa ?
3976 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size; 3977 if (bit_size == 64) { 3978 type_size = 8; 3979 fs_reg tmp = 3980 fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type); 3981 shuffle_64bit_data_for_32bit_write(bld, 3982 retype(tmp, BRW_REGISTER_TYPE_F), 3983 retype(val_reg, BRW_REGISTER_TYPE_DF), 3984 instr->num_components); 3985 val_reg = tmp; 3986 } 3987 3988 unsigned type_slots = type_size / 4; 3989 3990 /* Combine groups of consecutive enabled channels in one write 3991 * message. We use ffs to find the first enabled channel and then ffs on 3992 * the bit-inverse, down-shifted writemask to determine the length of 3993 * the block of enabled bits. 3994 */ 3995 while (writemask) { 3996 unsigned first_component = ffs(writemask) - 1; 3997 unsigned length = ffs(~(writemask >> first_component)) - 1; 3998 3999 /* We can't write more than 2 64-bit components at once. Limit the 4000 * length of the write to what we can do and let the next iteration 4001 * handle the rest 4002 */ 4003 if (type_size > 4) 4004 length = MIN2(2, length); 4005 4006 fs_reg offset_reg; 4007 nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]); 4008 if (const_offset) { 4009 offset_reg = brw_imm_ud(const_offset->u32[0] + 4010 type_size * first_component); 4011 } else { 4012 offset_reg = vgrf(glsl_type::uint_type); 4013 bld.ADD(offset_reg, 4014 retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD), 4015 brw_imm_ud(type_size * first_component)); 4016 } 4017 4018 4019 emit_untyped_write(bld, surf_index, offset_reg, 4020 offset(val_reg, bld, first_component * type_slots), 4021 1 /* dims */, length * type_slots, 4022 BRW_PREDICATE_NONE); 4023 4024 /* Clear the bits in the writemask that we just wrote, then try 4025 * again to see if more channels are left. 4026 */ 4027 writemask &= (15 << (first_component + length)); 4028 } 4029 break; 4030 } 4031 4032 case nir_intrinsic_store_output: { 4033 fs_reg src = get_nir_src(instr->src[0]); 4034 4035 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); 4036 assert(const_offset && "Indirect output stores not allowed"); 4037 fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld, 4038 4 * const_offset->u32[0]), src.type); 4039 4040 unsigned num_components = instr->num_components; 4041 unsigned first_component = nir_intrinsic_component(instr); 4042 unsigned bit_size = instr->src[0].is_ssa ? 
         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
      if (bit_size == 64) {
         fs_reg tmp =
            fs_reg(VGRF, alloc.allocate(2 * num_components),
                   BRW_REGISTER_TYPE_F);
         shuffle_64bit_data_for_32bit_write(
            bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
         src = retype(tmp, src.type);
         num_components *= 2;
      }

      for (unsigned j = 0; j < num_components; j++) {
         bld.MOV(offset(new_dest, bld, j + first_component),
                 offset(src, bld, j));
      }
      break;
   }

   case nir_intrinsic_ssbo_atomic_add:
      nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
      break;
   case nir_intrinsic_ssbo_atomic_imin:
      nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
      break;
   case nir_intrinsic_ssbo_atomic_umin:
      nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
      break;
   case nir_intrinsic_ssbo_atomic_imax:
      nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
      break;
   case nir_intrinsic_ssbo_atomic_umax:
      nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
      break;
   case nir_intrinsic_ssbo_atomic_and:
      nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
      break;
   case nir_intrinsic_ssbo_atomic_or:
      nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
      break;
   case nir_intrinsic_ssbo_atomic_xor:
      nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
      break;
   case nir_intrinsic_ssbo_atomic_exchange:
      nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap:
      nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
      break;

   case nir_intrinsic_get_buffer_size: {
      nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;

      /* A resinfo's sampler message is used to get the buffer size. The
       * SIMD8's writeback message consists of four registers and SIMD16's
       * writeback message consists of 8 destination registers (two per
       * component). Because we are only interested in the first channel of
       * the first returned component, where resinfo returns the buffer size
       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
       * the dispatch width.
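       * The message below therefore sends a single-register payload holding
       * LOD = 0 and reads back four registers, of which only component 0 of
       * the first is copied to the destination.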
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);
      fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);

      /* Set LOD = 0 */
      ubld.MOV(src_payload, brw_imm_d(0));

      const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
      fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
                                src_payload, brw_imm_ud(index));
      inst->header_size = 0;
      inst->mlen = 1;
      inst->size_written = 4 * REG_SIZE;

      bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
      brw_mark_surface_used(prog_data, index);
      break;
   }

   case nir_intrinsic_load_channel_num: {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
      dest = retype(dest, BRW_REGISTER_TYPE_UD);
      const fs_builder allbld8 = bld.group(8, 0).exec_all();
      allbld8.MOV(tmp, brw_imm_v(0x76543210));
      if (dispatch_width > 8)
         allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
      if (dispatch_width > 16) {
         const fs_builder allbld16 = bld.group(16, 0).exec_all();
         allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
      }
      bld.MOV(dest, tmp);
      break;
   }

   default:
      unreachable("unknown intrinsic");
   }
}

void
fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
                                 int op, nir_intrinsic_instr *instr)
{
   if (stage == MESA_SHADER_FRAGMENT)
      brw_wm_prog_data(prog_data)->has_side_effects = true;

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   fs_reg surface;
   nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
   if (const_surface) {
      unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
                            const_surface->u32[0];
      surface = brw_imm_ud(surf_index);
      brw_mark_surface_used(prog_data, surf_index);
   } else {
      surface = vgrf(glsl_type::uint_type);
      bld.ADD(surface, get_nir_src(instr->src[0]),
              brw_imm_ud(stage_prog_data->binding_table.ssbo_start));

      /* Assume this may touch any SSBO. This is the same as we do for other
       * UBO/SSBO accesses with non-constant surface.
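       * Marking the last possible SSBO slot keeps the binding table large
       * enough for whatever surface the computed index ends up selecting.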
4168 */ 4169 brw_mark_surface_used(prog_data, 4170 stage_prog_data->binding_table.ssbo_start + 4171 nir->info->num_ssbos - 1); 4172 } 4173 4174 fs_reg offset = get_nir_src(instr->src[1]); 4175 fs_reg data1 = get_nir_src(instr->src[2]); 4176 fs_reg data2; 4177 if (op == BRW_AOP_CMPWR) 4178 data2 = get_nir_src(instr->src[3]); 4179 4180 /* Emit the actual atomic operation */ 4181 4182 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, 4183 data1, data2, 4184 1 /* dims */, 1 /* rsize */, 4185 op, 4186 BRW_PREDICATE_NONE); 4187 dest.type = atomic_result.type; 4188 bld.MOV(dest, atomic_result); 4189} 4190 4191void 4192fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, 4193 int op, nir_intrinsic_instr *instr) 4194{ 4195 fs_reg dest; 4196 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4197 dest = get_nir_dest(instr->dest); 4198 4199 fs_reg surface = brw_imm_ud(GEN7_BTI_SLM); 4200 fs_reg offset; 4201 fs_reg data1 = get_nir_src(instr->src[1]); 4202 fs_reg data2; 4203 if (op == BRW_AOP_CMPWR) 4204 data2 = get_nir_src(instr->src[2]); 4205 4206 /* Get the offset */ 4207 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 4208 if (const_offset) { 4209 offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); 4210 } else { 4211 offset = vgrf(glsl_type::uint_type); 4212 bld.ADD(offset, 4213 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 4214 brw_imm_ud(instr->const_index[0])); 4215 } 4216 4217 /* Emit the actual atomic operation operation */ 4218 4219 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, 4220 data1, data2, 4221 1 /* dims */, 1 /* rsize */, 4222 op, 4223 BRW_PREDICATE_NONE); 4224 dest.type = atomic_result.type; 4225 bld.MOV(dest, atomic_result); 4226} 4227 4228void 4229fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) 4230{ 4231 unsigned texture = instr->texture_index; 4232 unsigned sampler = instr->sampler_index; 4233 4234 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 4235 4236 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture); 4237 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler); 4238 4239 int lod_components = 0; 4240 4241 /* The hardware requires a LOD for buffer textures */ 4242 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 4243 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); 4244 4245 uint32_t header_bits = 0; 4246 for (unsigned i = 0; i < instr->num_srcs; i++) { 4247 fs_reg src = get_nir_src(instr->src[i].src); 4248 switch (instr->src[i].src_type) { 4249 case nir_tex_src_bias: 4250 srcs[TEX_LOGICAL_SRC_LOD] = 4251 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 4252 break; 4253 case nir_tex_src_comparator: 4254 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); 4255 break; 4256 case nir_tex_src_coord: 4257 switch (instr->op) { 4258 case nir_texop_txf: 4259 case nir_texop_txf_ms: 4260 case nir_texop_txf_ms_mcs: 4261 case nir_texop_samples_identical: 4262 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); 4263 break; 4264 default: 4265 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); 4266 break; 4267 } 4268 break; 4269 case nir_tex_src_ddx: 4270 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); 4271 lod_components = nir_tex_instr_src_size(instr, i); 4272 break; 4273 case nir_tex_src_ddy: 4274 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F); 4275 break; 4276 case nir_tex_src_lod: 4277 switch (instr->op) { 4278 case nir_texop_txs: 4279 srcs[TEX_LOGICAL_SRC_LOD] = 4280 
retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD); 4281 break; 4282 case nir_texop_txf: 4283 srcs[TEX_LOGICAL_SRC_LOD] = 4284 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D); 4285 break; 4286 default: 4287 srcs[TEX_LOGICAL_SRC_LOD] = 4288 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 4289 break; 4290 } 4291 break; 4292 case nir_tex_src_ms_index: 4293 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); 4294 break; 4295 4296 case nir_tex_src_offset: { 4297 nir_const_value *const_offset = 4298 nir_src_as_const_value(instr->src[i].src); 4299 unsigned offset_bits = 0; 4300 if (const_offset && 4301 brw_texture_offset(const_offset->i32, 4302 nir_tex_instr_src_size(instr, i), 4303 &offset_bits)) { 4304 header_bits |= offset_bits; 4305 } else { 4306 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = 4307 retype(src, BRW_REGISTER_TYPE_D); 4308 } 4309 break; 4310 } 4311 4312 case nir_tex_src_projector: 4313 unreachable("should be lowered"); 4314 4315 case nir_tex_src_texture_offset: { 4316 /* Figure out the highest possible texture index and mark it as used */ 4317 uint32_t max_used = texture + instr->texture_array_size - 1; 4318 if (instr->op == nir_texop_tg4 && devinfo->gen < 8) { 4319 max_used += stage_prog_data->binding_table.gather_texture_start; 4320 } else { 4321 max_used += stage_prog_data->binding_table.texture_start; 4322 } 4323 brw_mark_surface_used(prog_data, max_used); 4324 4325 /* Emit code to evaluate the actual indexing expression */ 4326 fs_reg tmp = vgrf(glsl_type::uint_type); 4327 bld.ADD(tmp, src, brw_imm_ud(texture)); 4328 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); 4329 break; 4330 } 4331 4332 case nir_tex_src_sampler_offset: { 4333 /* Emit code to evaluate the actual indexing expression */ 4334 fs_reg tmp = vgrf(glsl_type::uint_type); 4335 bld.ADD(tmp, src, brw_imm_ud(sampler)); 4336 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); 4337 break; 4338 } 4339 4340 case nir_tex_src_ms_mcs: 4341 assert(instr->op == nir_texop_txf_ms); 4342 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); 4343 break; 4344 4345 case nir_tex_src_plane: { 4346 nir_const_value *const_plane = 4347 nir_src_as_const_value(instr->src[i].src); 4348 const uint32_t plane = const_plane->u32[0]; 4349 const uint32_t texture_index = 4350 instr->texture_index + 4351 stage_prog_data->binding_table.plane_start[plane] - 4352 stage_prog_data->binding_table.texture_start; 4353 4354 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index); 4355 break; 4356 } 4357 4358 default: 4359 unreachable("unknown texture source"); 4360 } 4361 } 4362 4363 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && 4364 (instr->op == nir_texop_txf_ms || 4365 instr->op == nir_texop_samples_identical)) { 4366 if (devinfo->gen >= 7 && 4367 key_tex->compressed_multisample_layout_mask & (1 << texture)) { 4368 srcs[TEX_LOGICAL_SRC_MCS] = 4369 emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE], 4370 instr->coord_components, 4371 srcs[TEX_LOGICAL_SRC_SURFACE]); 4372 } else { 4373 srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u); 4374 } 4375 } 4376 4377 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); 4378 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); 4379 4380 enum opcode opcode; 4381 switch (instr->op) { 4382 case nir_texop_tex: 4383 opcode = (stage == MESA_SHADER_FRAGMENT ? 
SHADER_OPCODE_TEX_LOGICAL : 4384 SHADER_OPCODE_TXL_LOGICAL); 4385 break; 4386 case nir_texop_txb: 4387 opcode = FS_OPCODE_TXB_LOGICAL; 4388 break; 4389 case nir_texop_txl: 4390 opcode = SHADER_OPCODE_TXL_LOGICAL; 4391 break; 4392 case nir_texop_txd: 4393 opcode = SHADER_OPCODE_TXD_LOGICAL; 4394 break; 4395 case nir_texop_txf: 4396 opcode = SHADER_OPCODE_TXF_LOGICAL; 4397 break; 4398 case nir_texop_txf_ms: 4399 if ((key_tex->msaa_16 & (1 << sampler))) 4400 opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; 4401 else 4402 opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; 4403 break; 4404 case nir_texop_txf_ms_mcs: 4405 opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; 4406 break; 4407 case nir_texop_query_levels: 4408 case nir_texop_txs: 4409 opcode = SHADER_OPCODE_TXS_LOGICAL; 4410 break; 4411 case nir_texop_lod: 4412 opcode = SHADER_OPCODE_LOD_LOGICAL; 4413 break; 4414 case nir_texop_tg4: 4415 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) 4416 opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; 4417 else 4418 opcode = SHADER_OPCODE_TG4_LOGICAL; 4419 break; 4420 case nir_texop_texture_samples: 4421 opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; 4422 break; 4423 case nir_texop_samples_identical: { 4424 fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D); 4425 4426 /* If mcs is an immediate value, it means there is no MCS. In that case 4427 * just return false. 4428 */ 4429 if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { 4430 bld.MOV(dst, brw_imm_ud(0u)); 4431 } else if ((key_tex->msaa_16 & (1 << sampler))) { 4432 fs_reg tmp = vgrf(glsl_type::uint_type); 4433 bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], 4434 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); 4435 bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); 4436 } else { 4437 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), 4438 BRW_CONDITIONAL_EQ); 4439 } 4440 return; 4441 } 4442 default: 4443 unreachable("unknown texture opcode"); 4444 } 4445 4446 /* TXS and TXL require a LOD but not everything we implement using those 4447 * two opcodes provides one. Provide a default LOD of 0. 4448 */ 4449 if ((opcode == SHADER_OPCODE_TXS_LOGICAL || 4450 opcode == SHADER_OPCODE_TXL_LOGICAL) && 4451 srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE) { 4452 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u); 4453 } 4454 4455 if (instr->op == nir_texop_tg4) { 4456 if (instr->component == 1 && 4457 key_tex->gather_channel_quirk_mask & (1 << texture)) { 4458 /* gather4 sampler is broken for green channel on RG32F -- 4459 * we must ask for blue instead. 4460 */ 4461 header_bits |= 2 << 16; 4462 } else { 4463 header_bits |= instr->component << 16; 4464 } 4465 } 4466 4467 fs_reg dst = bld.vgrf(brw_type_for_nir_type(instr->dest_type), 4); 4468 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); 4469 inst->offset = header_bits; 4470 4471 const unsigned dest_size = nir_tex_instr_dest_size(instr); 4472 if (devinfo->gen >= 9 && 4473 instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { 4474 unsigned write_mask = instr->dest.is_ssa ? 
4475 nir_ssa_def_components_read(&instr->dest.ssa): 4476 (1 << dest_size) - 1; 4477 assert(write_mask != 0); /* dead code should have been eliminated */ 4478 inst->size_written = util_last_bit(write_mask) * 4479 inst->dst.component_size(inst->exec_size); 4480 } else { 4481 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 4482 } 4483 4484 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) 4485 inst->shadow_compare = true; 4486 4487 if (instr->op == nir_texop_tg4 && devinfo->gen == 6) 4488 emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst); 4489 4490 fs_reg nir_dest[4]; 4491 for (unsigned i = 0; i < dest_size; i++) 4492 nir_dest[i] = offset(dst, bld, i); 4493 4494 if (instr->op == nir_texop_query_levels) { 4495 /* # levels is in .w */ 4496 nir_dest[0] = offset(dst, bld, 3); 4497 } else if (instr->op == nir_texop_txs && 4498 dest_size >= 3 && devinfo->gen < 7) { 4499 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */ 4500 fs_reg depth = offset(dst, bld, 2); 4501 nir_dest[2] = vgrf(glsl_type::int_type); 4502 bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); 4503 } 4504 4505 bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0); 4506} 4507 4508void 4509fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr) 4510{ 4511 switch (instr->type) { 4512 case nir_jump_break: 4513 bld.emit(BRW_OPCODE_BREAK); 4514 break; 4515 case nir_jump_continue: 4516 bld.emit(BRW_OPCODE_CONTINUE); 4517 break; 4518 case nir_jump_return: 4519 default: 4520 unreachable("unknown jump"); 4521 } 4522} 4523 4524/** 4525 * This helper takes the result of a load operation that reads 32-bit elements 4526 * in this format: 4527 * 4528 * x x x x x x x x 4529 * y y y y y y y y 4530 * z z z z z z z z 4531 * w w w w w w w w 4532 * 4533 * and shuffles the data to get this: 4534 * 4535 * x y x y x y x y 4536 * x y x y x y x y 4537 * z w z w z w z w 4538 * z w z w z w z w 4539 * 4540 * Which is exactly what we want if the load is reading 64-bit components 4541 * like doubles, where x represents the low 32-bit of the x double component 4542 * and y represents the high 32-bit of the x double component (likewise with 4543 * z and w for double component y). The parameter @components represents 4544 * the number of 64-bit components present in @src. This would typically be 4545 * 2 at most, since we can only fit 2 double elements in the result of a 4546 * vec4 load. 4547 * 4548 * Notice that @dst and @src can be the same register. 4549 */ 4550void 4551shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld, 4552 const fs_reg &dst, 4553 const fs_reg &src, 4554 uint32_t components) 4555{ 4556 assert(type_sz(src.type) == 4); 4557 assert(type_sz(dst.type) == 8); 4558 4559 /* A temporary that we will use to shuffle the 32-bit data of each 4560 * component in the vector into valid 64-bit data. We can't write directly 4561 * to dst because dst can be (and would usually be) the same as src 4562 * and in that case the first MOV in the loop below would overwrite the 4563 * data read in the second MOV. 4564 */ 4565 fs_reg tmp = bld.vgrf(dst.type); 4566 4567 for (unsigned i = 0; i < components; i++) { 4568 const fs_reg component_i = offset(src, bld, 2 * i); 4569 4570 bld.MOV(subscript(tmp, src.type, 0), component_i); 4571 bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1)); 4572 4573 bld.MOV(offset(dst, bld, i), tmp); 4574 } 4575} 4576 4577/** 4578 * This helper does the inverse operation of 4579 * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA. 
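 * (i.e. it takes 64-bit components whose low/high dwords are interleaved
 * per channel, x y x y ... / z w z w ..., and produces one row of low
 * dwords followed by one row of high dwords for each component,
 * x x ... / y y ... / z z ... / w w ..., which is the layout the 32-bit
 * untyped write messages consume).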
 *
 * We need to do this when we are going to use untyped write messages that
 * operate with 32-bit components in order to arrange our 64-bit data to be
 * in the expected layout.
 *
 * Notice that callers of this function, unlike in the case of the inverse
 * operation, would typically need to call this with dst and src being
 * different registers, since they would otherwise corrupt the original
 * 64-bit data they are about to write. Because of this the function checks
 * that the src and dst regions involved in the operation do not overlap.
 */
void
shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
                                   const fs_reg &dst,
                                   const fs_reg &src,
                                   uint32_t components)
{
   assert(type_sz(src.type) == 8);
   assert(type_sz(dst.type) == 4);

   assert(!regions_overlap(
             dst, 2 * components * dst.component_size(bld.dispatch_width()),
             src, components * src.component_size(bld.dispatch_width())));

   for (unsigned i = 0; i < components; i++) {
      const fs_reg component_i = offset(src, bld, i);
      bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
      bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
   }
}

fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
   const struct gen_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->gen >= 7);

   if (devinfo->gen >= 8)
      return brw_imm_df(v);

   /* gen7.5 does not support DF immediates straightforwardly, but the DIM
    * instruction allows us to set the 64-bit immediate value.
    */
   if (devinfo->is_haswell) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
      ubld.DIM(dst, brw_imm_df(v));
      return component(dst, 0);
   }

   /* gen7 does not support DF immediates, so we generate a 64-bit constant by
    * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
    * the high 32-bit to suboffset 4 and then applying a stride of 0.
    *
    * Alternatively, we could also produce a normal VGRF (without stride 0)
    * by writing to all the channels in the VGRF, however, that would hit the
    * gen7 bug where we have to split writes that span more than 1 register
    * into instructions with a width of 4 (otherwise the write to the second
    * register written runs into an execmask hardware bug) which isn't very
    * nice.
    */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   const fs_builder ubld = bld.exec_all().group(1, 0);
   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   ubld.MOV(tmp, brw_imm_ud(di.i1));
   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));

   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
}
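
/* Usage sketch (illustrative): on gen7 a double-precision constant source
 * would be materialized roughly as
 *
 *    const fs_reg half = setup_imm_df(bld, 0.5);
 *    bld.MOV(dst, half);
 *
 * where dst is assumed to be a DF-typed VGRF; on gen8+ setup_imm_df() simply
 * returns brw_imm_df() and no extra instructions are emitted.
 */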