// gen6_gs_visitor.cpp revision 6ac2bbec16d73f0cc58fc520c4165239461c59b3
1/* 2 * Copyright © 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * This code is based on original work by Ilia Mirkin. 24 */ 25 26/** 27 * \file gen6_gs_visitor.cpp 28 * 29 * Gen6 geometry shader implementation 30 */ 31 32#include "gen6_gs_visitor.h" 33 34namespace brw { 35 36void 37gen6_gs_visitor::emit_prolog() 38{ 39 vec4_gs_visitor::emit_prolog(); 40 41 /* Gen6 geometry shaders require to allocate an initial VUE handle via 42 * FF_SYNC message, however the documentation remarks that only one thread 43 * can write to the URB simultaneously and the FF_SYNC message provides the 44 * synchronization mechanism for this, so using this message effectively 45 * stalls the thread until it is its turn to write to the URB. 
Because of 46 * this, the best way to implement geometry shader algorithms in gen6 is to 47 * execute the algorithm before the FF_SYNC message to maximize parallelism. 48 * 49 * To achieve this we buffer the geometry shader outputs for each emitted 50 * vertex in vertex_output during operation. Then, when we have processed 51 * the last vertex (that is, at thread end time), we send the FF_SYNC 52 * message to allocate the initial VUE handle and write all buffered vertex 53 * data to the URB in one go. 54 * 55 * For each emitted vertex, vertex_output will hold vue_map.num_slots 56 * data items plus one additional item to hold required flags 57 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message) 58 * which come right after the data items for that vertex. Vertex data and 59 * flags for the next vertex come right after the data items and flags for 60 * the previous vertex. 61 */ 62 this->current_annotation = "gen6 prolog"; 63 this->vertex_output = src_reg(this, 64 glsl_type::uint_type, 65 (prog_data->vue_map.num_slots + 1) * 66 nir->info.gs.vertices_out); 67 this->vertex_output_offset = src_reg(this, glsl_type::uint_type); 68 emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u))); 69 70 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES), 71 * so initialize it once to R0. 72 */ 73 vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1), 74 retype(brw_vec8_grf(0, 0), 75 BRW_REGISTER_TYPE_UD))); 76 inst->force_writemask_all = true; 77 78 /* This will be used as a temporary to store writeback data of FF_SYNC 79 * and URB_WRITE messages. 80 */ 81 this->temp = src_reg(this, glsl_type::uint_type); 82 83 /* This will be used to know when we are processing the first vertex of 84 * a primitive. We will set this to URB_WRITE_PRIM_START only when we know 85 * that we are processing the first vertex in the primitive and to zero 86 * otherwise. This way we can use its value directly in the URB write 87 * headers. 
88 */ 89 this->first_vertex = src_reg(this, glsl_type::uint_type); 90 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START)); 91 92 /* The FF_SYNC message requires to know the number of primitives generated, 93 * so keep a counter for this. 94 */ 95 this->prim_count = src_reg(this, glsl_type::uint_type); 96 emit(MOV(dst_reg(this->prim_count), 0u)); 97 98 if (c->prog_data.gen6_xfb_enabled) { 99 /* Create a virtual register to hold destination indices in SOL */ 100 this->destination_indices = src_reg(this, glsl_type::uvec4_type); 101 /* Create a virtual register to hold number of written primitives */ 102 this->sol_prim_written = src_reg(this, glsl_type::uint_type); 103 /* Create a virtual register to hold Streamed Vertex Buffer Indices */ 104 this->svbi = src_reg(this, glsl_type::uvec4_type); 105 /* Create a virtual register to hold max values of SVBI */ 106 this->max_svbi = src_reg(this, glsl_type::uvec4_type); 107 emit(MOV(dst_reg(this->max_svbi), 108 src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD)))); 109 110 xfb_setup(); 111 } 112 113 /* PrimitveID is delivered in r0.1 of the thread payload. If the program 114 * needs it we have to move it to a separate register where we can map 115 * the atttribute. 116 * 117 * Notice that we cannot use a virtual register for this, because we need to 118 * map all input attributes to hardware registers in setup_payload(), 119 * which happens before virtual registers are mapped to hardware registers. 120 * We could work around that issue if we were able to compute the first 121 * non-payload register here and move the PrimitiveID information to that 122 * register, but we can't because at this point we don't know the final 123 * number uniforms that will be included in the payload. 
124 * 125 * So, what we do is to place PrimitiveID information in r1, which is always 126 * delivered as part of the payload, but its only populated with data 127 * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE 128 * in the 3DSTATE_GS state packet. That information can be obtained by other 129 * means though, so we can safely use r1 for this purpose. 130 */ 131 if (c->prog_data.include_primitive_id) { 132 this->primitive_id = 133 src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 134 emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id)); 135 } 136} 137 138void 139gen6_gs_visitor::gs_emit_vertex(int stream_id) 140{ 141 this->current_annotation = "gen6 emit vertex"; 142 143 /* Buffer all output slots for this vertex in vertex_output */ 144 for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) { 145 int varying = prog_data->vue_map.slot_to_varying[slot]; 146 if (varying != VARYING_SLOT_PSIZ) { 147 dst_reg dst(this->vertex_output); 148 dst.reladdr = ralloc(mem_ctx, src_reg); 149 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 150 emit_urb_slot(dst, varying); 151 } else { 152 /* The PSIZ slot can pack multiple varyings in different channels 153 * and emit_urb_slot() will produce a MOV instruction for each of 154 * them. Since we are writing to an array, that will translate to 155 * possibly multiple MOV instructions with an array destination and 156 * each will generate a scratch write with the same offset into 157 * scratch space (thus, each one overwriting the previous). This is 158 * not what we want. What we will do instead is emit PSIZ to a 159 * a regular temporary register, then move that resgister into the 160 * array. This way we only have one instruction with an array 161 * destination and we only produce a single scratch write. 
162 */ 163 dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type)); 164 emit_urb_slot(tmp, varying); 165 dst_reg dst(this->vertex_output); 166 dst.reladdr = ralloc(mem_ctx, src_reg); 167 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 168 vec4_instruction *inst = emit(MOV(dst, src_reg(tmp))); 169 inst->force_writemask_all = true; 170 } 171 172 emit(ADD(dst_reg(this->vertex_output_offset), 173 this->vertex_output_offset, 1u)); 174 } 175 176 /* Now buffer flags for this vertex */ 177 dst_reg dst(this->vertex_output); 178 dst.reladdr = ralloc(mem_ctx, src_reg); 179 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 180 if (nir->info.gs.output_primitive == GL_POINTS) { 181 /* If we are outputting points, then every vertex has PrimStart and 182 * PrimEnd set. 183 */ 184 emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) | 185 URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)); 186 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u)); 187 } else { 188 /* Otherwise, we can only set the PrimStart flag, which we have stored 189 * in the first_vertex register. We will have to wait until we execute 190 * EndPrimitive() or we end the thread to set the PrimEnd flag on a 191 * vertex. 192 */ 193 emit(OR(dst, this->first_vertex, 194 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT))); 195 emit(MOV(dst_reg(this->first_vertex), 0u)); 196 } 197 emit(ADD(dst_reg(this->vertex_output_offset), 198 this->vertex_output_offset, 1u)); 199} 200 201void 202gen6_gs_visitor::gs_end_primitive() 203{ 204 this->current_annotation = "gen6 end primitive"; 205 /* Calling EndPrimitive() is optional for point output. In this case we set 206 * the PrimEnd flag when we process EmitVertex(). 
207 */ 208 if (nir->info.gs.output_primitive == GL_POINTS) 209 return; 210 211 /* Otherwise we know that the last vertex we have processed was the last 212 * vertex in the primitive and we need to set its PrimEnd flag, so do this 213 * unless we haven't emitted that vertex at all (vertex_count != 0). 214 * 215 * Notice that we have already incremented vertex_count when we processed 216 * the last emit_vertex, so we need to take that into account in the 217 * comparison below (hence the num_output_vertices + 1 in the comparison 218 * below). 219 */ 220 unsigned num_output_vertices = nir->info.gs.vertices_out; 221 emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1), 222 BRW_CONDITIONAL_L)); 223 vec4_instruction *inst = emit(CMP(dst_null_d(), 224 this->vertex_count, 0u, 225 BRW_CONDITIONAL_NEQ)); 226 inst->predicate = BRW_PREDICATE_NORMAL; 227 emit(IF(BRW_PREDICATE_NORMAL)); 228 { 229 /* vertex_output_offset is already pointing at the first entry of the 230 * next vertex. So subtract 1 to modify the flags for the previous 231 * vertex. 232 */ 233 src_reg offset(this, glsl_type::uint_type); 234 emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1))); 235 236 src_reg dst(this->vertex_output); 237 dst.reladdr = ralloc(mem_ctx, src_reg); 238 memcpy(dst.reladdr, &offset, sizeof(src_reg)); 239 240 emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END)); 241 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u)); 242 243 /* Set the first vertex flag to indicate that the next vertex will start 244 * a primitive. 245 */ 246 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START)); 247 } 248 emit(BRW_OPCODE_ENDIF); 249} 250 251void 252gen6_gs_visitor::emit_urb_write_header(int mrf) 253{ 254 this->current_annotation = "gen6 urb header"; 255 /* Compute offset of the flags for the current vertex in vertex_output and 256 * write them in dw2 of the message header. 
257 * 258 * Notice that by the time that emit_thread_end() calls here 259 * vertex_output_offset should point to the first data item of the current 260 * vertex in vertex_output, thus we only need to add the number of output 261 * slots per vertex to that offset to obtain the flags data offset. 262 */ 263 src_reg flags_offset(this, glsl_type::uint_type); 264 emit(ADD(dst_reg(flags_offset), 265 this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots))); 266 267 src_reg flags_data(this->vertex_output); 268 flags_data.reladdr = ralloc(mem_ctx, src_reg); 269 memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg)); 270 271 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data); 272} 273 274static int 275align_interleaved_urb_mlen(int mlen) 276{ 277 /* URB data written (does not include the message header reg) must 278 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, 279 * section 5.4.3.2.2: URB_INTERLEAVED. 280 */ 281 if ((mlen % 2) != 1) 282 mlen++; 283 return mlen; 284} 285 286void 287gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf, 288 int last_mrf, int urb_offset) 289{ 290 vec4_instruction *inst = NULL; 291 292 if (!complete) { 293 /* If the vertex is not complete we don't have to do anything special */ 294 inst = emit(GS_OPCODE_URB_WRITE); 295 inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; 296 } else { 297 /* Otherwise we always request to allocate a new VUE handle. If this is 298 * the last write before the EOT message and the new handle never gets 299 * used it will be dereferenced when we send the EOT message. This is 300 * necessary to avoid different setups for the EOT message (one for the 301 * case when there is no output and another for the case when there is) 302 * which would require to end the program with an IF/ELSE/ENDIF block, 303 * something we do not want. 
304 */ 305 inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE); 306 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE; 307 inst->dst = dst_reg(MRF, base_mrf); 308 inst->src[0] = this->temp; 309 } 310 311 inst->base_mrf = base_mrf; 312 inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf); 313 inst->offset = urb_offset; 314} 315 316void 317gen6_gs_visitor::emit_thread_end() 318{ 319 /* Make sure the current primitive is ended: we know it is not ended when 320 * first_vertex is not zero. This is only relevant for outputs other than 321 * points because in the point case we set PrimEnd on all vertices. 322 */ 323 if (nir->info.gs.output_primitive != GL_POINTS) { 324 emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z)); 325 emit(IF(BRW_PREDICATE_NORMAL)); 326 gs_end_primitive(); 327 emit(BRW_OPCODE_ENDIF); 328 } 329 330 /* Here we have to: 331 * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle. 332 * 2) Loop over all buffered vertex data and write it to corresponding 333 * URB entries. 334 * 3) Allocate new VUE handles for all vertices other than the first. 335 * 4) Send a final EOT message. 336 */ 337 338 /* MRF 0 is reserved for the debugger, so start with message header 339 * in MRF 1. 340 */ 341 int base_mrf = 1; 342 343 /* In the process of generating our URB write message contents, we 344 * may need to unspill a register or load from an array. Those 345 * reads would use MRFs 21..23 346 */ 347 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen); 348 349 /* Issue the FF_SYNC message and obtain the initial VUE handle. 
*/ 350 emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G)); 351 emit(IF(BRW_PREDICATE_NORMAL)); 352 { 353 this->current_annotation = "gen6 thread end: ff_sync"; 354 355 vec4_instruction *inst; 356 if (c->prog_data.gen6_xfb_enabled) { 357 src_reg sol_temp(this, glsl_type::uvec4_type); 358 emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES, 359 dst_reg(this->svbi), 360 this->vertex_count, 361 this->prim_count, 362 sol_temp); 363 inst = emit(GS_OPCODE_FF_SYNC, 364 dst_reg(this->temp), this->prim_count, this->svbi); 365 } else { 366 inst = emit(GS_OPCODE_FF_SYNC, 367 dst_reg(this->temp), this->prim_count, src_reg(0u)); 368 } 369 inst->base_mrf = base_mrf; 370 371 /* Loop over all buffered vertices and emit URB write messages */ 372 this->current_annotation = "gen6 thread end: urb writes init"; 373 src_reg vertex(this, glsl_type::uint_type); 374 emit(MOV(dst_reg(vertex), 0u)); 375 emit(MOV(dst_reg(this->vertex_output_offset), 0u)); 376 377 this->current_annotation = "gen6 thread end: urb writes"; 378 emit(BRW_OPCODE_DO); 379 { 380 emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE)); 381 inst = emit(BRW_OPCODE_BREAK); 382 inst->predicate = BRW_PREDICATE_NORMAL; 383 384 /* First we prepare the message header */ 385 emit_urb_write_header(base_mrf); 386 387 /* Then add vertex data to the message in interleaved fashion */ 388 int slot = 0; 389 bool complete = false; 390 do { 391 int mrf = base_mrf + 1; 392 393 /* URB offset is in URB row increments, and each of our MRFs is half 394 * of one of those, since we're doing interleaved writes. 
395 */ 396 int urb_offset = slot / 2; 397 398 for (; slot < prog_data->vue_map.num_slots; ++slot) { 399 int varying = prog_data->vue_map.slot_to_varying[slot]; 400 current_annotation = output_reg_annotation[varying]; 401 402 /* Compute offset of this slot for the current vertex 403 * in vertex_output 404 */ 405 src_reg data(this->vertex_output); 406 data.reladdr = ralloc(mem_ctx, src_reg); 407 memcpy(data.reladdr, &this->vertex_output_offset, 408 sizeof(src_reg)); 409 410 /* Copy this slot to the appropriate message register */ 411 dst_reg reg = dst_reg(MRF, mrf); 412 reg.type = output_reg[varying].type; 413 data.type = reg.type; 414 vec4_instruction *inst = emit(MOV(reg, data)); 415 inst->force_writemask_all = true; 416 417 mrf++; 418 emit(ADD(dst_reg(this->vertex_output_offset), 419 this->vertex_output_offset, 1u)); 420 421 /* If this was max_usable_mrf, we can't fit anything more into 422 * this URB WRITE. Same if we reached the max. message length. 423 */ 424 if (mrf > max_usable_mrf || 425 align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { 426 slot++; 427 break; 428 } 429 } 430 431 complete = slot >= prog_data->vue_map.num_slots; 432 emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset); 433 } while (!complete); 434 435 /* Skip over the flags data item so that vertex_output_offset points 436 * to the first data item of the next vertex, so that we can start 437 * writing the next vertex. 438 */ 439 emit(ADD(dst_reg(this->vertex_output_offset), 440 this->vertex_output_offset, 1u)); 441 442 emit(ADD(dst_reg(vertex), vertex, 1u)); 443 } 444 emit(BRW_OPCODE_WHILE); 445 446 if (c->prog_data.gen6_xfb_enabled) 447 xfb_write(); 448 } 449 emit(BRW_OPCODE_ENDIF); 450 451 /* Finally, emit EOT message. 452 * 453 * In gen6 we need to end the thread differently depending on whether we have 454 * emitted at least one vertex or not. In case we did, the EOT message must 455 * always include the COMPLETE flag or else the GPU hangs. 
If we have not 456 * produced any output we can't use the COMPLETE flag. 457 * 458 * However, this would lead us to end the program with an ENDIF opcode, 459 * which we want to avoid, so what we do is that we always request a new 460 * VUE handle every time we do a URB WRITE, even for the last vertex we emit. 461 * With this we make sure that whether we have emitted at least one vertex 462 * or none at all, we have to finish the thread without writing to the URB, 463 * which works for both cases by setting the COMPLETE and UNUSED flags in 464 * the EOT message. 465 */ 466 this->current_annotation = "gen6 thread end: EOT"; 467 468 if (c->prog_data.gen6_xfb_enabled) { 469 /* When emitting EOT, set SONumPrimsWritten Increment Value. */ 470 src_reg data(this, glsl_type::uint_type); 471 emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu))); 472 emit(SHL(dst_reg(data), data, src_reg(16u))); 473 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data); 474 } 475 476 vec4_instruction *inst = emit(GS_OPCODE_THREAD_END); 477 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED; 478 inst->base_mrf = base_mrf; 479 inst->mlen = 1; 480} 481 482void 483gen6_gs_visitor::setup_payload() 484{ 485 int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES]; 486 487 /* Attributes are going to be interleaved, so one register contains two 488 * attribute slots. 489 */ 490 int attributes_per_reg = 2; 491 492 /* If a geometry shader tries to read from an input that wasn't written by 493 * the vertex shader, that produces undefined results, but it shouldn't 494 * crash anything. So initialize attribute_map to zeros--that ensures that 495 * these undefined results are read from r0. 496 */ 497 memset(attribute_map, 0, sizeof(attribute_map)); 498 499 int reg = 0; 500 501 /* The payload always contains important data in r0. 
*/ 502 reg++; 503 504 /* r1 is always part of the payload and it holds information relevant 505 * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in 506 * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID 507 * information (and move the original value to a virtual register if 508 * necessary). 509 */ 510 if (c->prog_data.include_primitive_id) 511 attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg; 512 reg++; 513 514 reg = setup_uniforms(reg); 515 516 reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg); 517 518 lower_attributes_to_hw_regs(attribute_map, true); 519 520 this->first_non_payload_grf = reg; 521} 522 523void 524gen6_gs_visitor::xfb_setup() 525{ 526 static const unsigned swizzle_for_offset[4] = { 527 BRW_SWIZZLE4(0, 1, 2, 3), 528 BRW_SWIZZLE4(1, 2, 3, 3), 529 BRW_SWIZZLE4(2, 3, 3, 3), 530 BRW_SWIZZLE4(3, 3, 3, 3) 531 }; 532 533 struct brw_gs_prog_data *prog_data = 534 (struct brw_gs_prog_data *) &c->prog_data; 535 536 const struct gl_transform_feedback_info *linked_xfb_info = 537 &this->shader_prog->LinkedTransformFeedback; 538 int i; 539 540 /* Make sure that the VUE slots won't overflow the unsigned chars in 541 * prog_data->transform_feedback_bindings[]. 542 */ 543 STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256); 544 545 /* Make sure that we don't need more binding table entries than we've 546 * set aside for use in transform feedback. (We shouldn't, since we 547 * set aside enough binding table entries to have one per component). 
548 */ 549 assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS); 550 551 prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs; 552 for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) { 553 prog_data->transform_feedback_bindings[i] = 554 linked_xfb_info->Outputs[i].OutputRegister; 555 prog_data->transform_feedback_swizzles[i] = 556 swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset]; 557 } 558} 559 560void 561gen6_gs_visitor::xfb_write() 562{ 563 unsigned num_verts; 564 struct brw_gs_prog_data *prog_data = 565 (struct brw_gs_prog_data *) &c->prog_data; 566 567 if (!prog_data->num_transform_feedback_bindings) 568 return; 569 570 switch (c->prog_data.output_topology) { 571 case _3DPRIM_POINTLIST: 572 num_verts = 1; 573 break; 574 case _3DPRIM_LINELIST: 575 case _3DPRIM_LINESTRIP: 576 case _3DPRIM_LINELOOP: 577 num_verts = 2; 578 break; 579 case _3DPRIM_TRILIST: 580 case _3DPRIM_TRIFAN: 581 case _3DPRIM_TRISTRIP: 582 case _3DPRIM_RECTLIST: 583 num_verts = 3; 584 break; 585 case _3DPRIM_QUADLIST: 586 case _3DPRIM_QUADSTRIP: 587 case _3DPRIM_POLYGON: 588 num_verts = 3; 589 break; 590 default: 591 unreachable("Unexpected primitive type in Gen6 SOL program."); 592 } 593 594 this->current_annotation = "gen6 thread end: svb writes init"; 595 596 emit(MOV(dst_reg(this->vertex_output_offset), 0u)); 597 emit(MOV(dst_reg(this->sol_prim_written), 0u)); 598 599 /* Check that at least one primitive can be written 600 * 601 * Note: since we use the binding table to keep track of buffer offsets 602 * and stride, the GS doesn't need to keep track of a separate pointer 603 * into each buffer; it uses a single pointer which increments by 1 for 604 * each vertex. So we use SVBI0 for this pointer, regardless of whether 605 * transform feedback is in interleaved or separate attribs mode. 
606 */ 607 src_reg sol_temp(this, glsl_type::uvec4_type); 608 emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts))); 609 610 /* Compare SVBI calculated number with the maximum value, which is 611 * in R1.4 (previously saved in this->max_svbi) for gen6. 612 */ 613 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); 614 emit(IF(BRW_PREDICATE_NORMAL)); 615 { 616 src_reg destination_indices_uw = 617 retype(destination_indices, BRW_REGISTER_TYPE_UW); 618 619 vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw), 620 brw_imm_v(0x00020100))); /* (0, 1, 2) */ 621 inst->force_writemask_all = true; 622 623 emit(ADD(dst_reg(this->destination_indices), 624 this->destination_indices, 625 this->svbi)); 626 } 627 emit(BRW_OPCODE_ENDIF); 628 629 /* Write transform feedback data for all processed vertices. */ 630 for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) { 631 emit(MOV(dst_reg(sol_temp), i)); 632 emit(CMP(dst_null_d(), sol_temp, this->vertex_count, 633 BRW_CONDITIONAL_L)); 634 emit(IF(BRW_PREDICATE_NORMAL)); 635 { 636 xfb_program(i, num_verts); 637 } 638 emit(BRW_OPCODE_ENDIF); 639 } 640} 641 642void 643gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts) 644{ 645 struct brw_gs_prog_data *prog_data = 646 (struct brw_gs_prog_data *) &c->prog_data; 647 unsigned binding; 648 unsigned num_bindings = prog_data->num_transform_feedback_bindings; 649 src_reg sol_temp(this, glsl_type::uvec4_type); 650 651 /* Check for buffer overflow: we need room to write the complete primitive 652 * (all vertices). 
Otherwise, avoid writing any vertices for it 653 */ 654 emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u)); 655 emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts))); 656 emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi)); 657 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); 658 emit(IF(BRW_PREDICATE_NORMAL)); 659 { 660 /* Avoid overwriting MRF 1 as it is used as URB write message header */ 661 dst_reg mrf_reg(MRF, 2); 662 663 this->current_annotation = "gen6: emit SOL vertex data"; 664 /* For each vertex, generate code to output each varying using the 665 * appropriate binding table entry. 666 */ 667 for (binding = 0; binding < num_bindings; ++binding) { 668 unsigned char varying = 669 prog_data->transform_feedback_bindings[binding]; 670 671 /* Set up the correct destination index for this vertex */ 672 vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX, 673 mrf_reg, 674 this->destination_indices); 675 inst->sol_vertex = vertex % num_verts; 676 677 /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1: 678 * 679 * "Prior to End of Thread with a URB_WRITE, the kernel must 680 * ensure that all writes are complete by sending the final 681 * write as a committed write." 
682 */ 683 bool final_write = binding == (unsigned) num_bindings - 1 && 684 inst->sol_vertex == num_verts - 1; 685 686 /* Compute offset of this varying for the current vertex 687 * in vertex_output 688 */ 689 this->current_annotation = output_reg_annotation[varying]; 690 src_reg data(this->vertex_output); 691 data.reladdr = ralloc(mem_ctx, src_reg); 692 int offset = get_vertex_output_offset_for_varying(vertex, varying); 693 emit(MOV(dst_reg(this->vertex_output_offset), offset)); 694 memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 695 data.type = output_reg[varying].type; 696 697 /* PSIZ, LAYER and VIEWPORT are packed in different channels of the 698 * same slot, so make sure we write the appropriate channel 699 */ 700 if (varying == VARYING_SLOT_PSIZ) 701 data.swizzle = BRW_SWIZZLE_WWWW; 702 else if (varying == VARYING_SLOT_LAYER) 703 data.swizzle = BRW_SWIZZLE_YYYY; 704 else if (varying == VARYING_SLOT_VIEWPORT) 705 data.swizzle = BRW_SWIZZLE_ZZZZ; 706 else 707 data.swizzle = prog_data->transform_feedback_swizzles[binding]; 708 709 /* Write data */ 710 inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp); 711 inst->sol_binding = binding; 712 inst->sol_final_write = final_write; 713 714 if (final_write) { 715 /* This is the last vertex of the primitive, then increment 716 * SO num primitive counter and destination indices. 717 */ 718 emit(ADD(dst_reg(this->destination_indices), 719 this->destination_indices, 720 src_reg(num_verts))); 721 emit(ADD(dst_reg(this->sol_prim_written), 722 this->sol_prim_written, 1u)); 723 } 724 725 } 726 this->current_annotation = NULL; 727 } 728 emit(BRW_OPCODE_ENDIF); 729} 730 731int 732gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying) 733{ 734 /* Find the output slot assigned to this varying. 735 * 736 * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot 737 * as VARYING_SLOT_PSIZ. 
738 */ 739 if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) 740 varying = VARYING_SLOT_PSIZ; 741 int slot = prog_data->vue_map.varying_to_slot[varying]; 742 743 if (slot < 0) { 744 /* This varying does not exist in the VUE so we are not writing to it 745 * and its value is undefined. We still want to return a valid offset 746 * into vertex_output though, to prevent any out-of-bound accesses into 747 * the vertex_output array. Since the value for this varying is undefined 748 * we don't really care for the value we assign to it, so any offset 749 * within the limits of vertex_output will do. 750 */ 751 slot = 0; 752 } 753 754 return vertex * (prog_data->vue_map.num_slots + 1) + slot; 755} 756 757} /* namespace brw */ 758