1/*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * This code is based on original work by Ilia Mirkin.
24 */
25
26/**
27 * \file gen6_gs_visitor.cpp
28 *
29 * Gen6 geometry shader implementation
30 */
31
32#include "gen6_gs_visitor.h"
33#include "brw_eu.h"
34
35namespace brw {
36
/**
 * Emit the Gen6 geometry shader prolog.
 *
 * Runs the base-class prolog and then sets up the state used by the rest of
 * this visitor: the vertex_output buffer and its running offset, the message
 * header in MRF 1, the temp, first_vertex and prim_count registers, the
 * transform feedback registers (only when the program has transform feedback
 * varyings) and the PrimitiveID register (only when the program includes it).
 */
void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders require to allocate an initial VUE handle via
    * FF_SYNC message, however the documentation remarks that only one thread
    * can write to the URB simultaneously and the FF_SYNC message provides the
    * synchronization mechanism for this, so using this message effectively
    * stalls the thread until it is its turn to write to the URB. Because of
    * this, the best way to implement geometry shader algorithms in gen6 is to
    * execute the algorithm before the FF_SYNC message to maximize parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each emitted
    * vertex in vertex_output during operation. Then, when we have processed
    * the last vertex (that is, at thread end time), we send the FF_SYNC
    * message to allocate the initial VUE handle and write all buffered vertex
    * data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
    * which come right after the data items for that vertex. Vertex data and
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
   this->current_annotation = "gen6 prolog";
   /* Sized for (num_slots + 1) items per vertex times the declared maximum
    * number of output vertices.
    */
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 nir->info->gs.vertices_out);
   /* Running index into vertex_output; starts at the first item. */
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
    * that we are processing the first vertex in the primitive and to zero
    * otherwise. This way we can use its value directly in the URB write
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));

   /* The FF_SYNC message requires to know the number of primitives generated,
    * so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));

   if (prog->info.has_transform_feedback_varyings) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold max values of SVBI. The value is
       * copied out of r1.4 here, before r1 may be overwritten with the
       * PrimitiveID further down.
       */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
    * needs it we have to move it to a separate register where we can map
    * the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we need to
    * map all input attributes to hardware registers in setup_payload(),
    * which happens before virtual registers are mapped to hardware registers.
    * We could work around that issue if we were able to compute the first
    * non-payload register here and move the PrimitiveID information to that
    * register, but we can't because at this point we don't know the final
    * number of uniforms that will be included in the payload.
    *
    * So, what we do is to place PrimitiveID information in r1, which is always
    * delivered as part of the payload, but it is only populated with data
    * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
    * in the 3DSTATE_GS state packet. That information can be obtained by other
    * means though, so we can safely use r1 for this purpose.
    */
   if (gs_prog_data->include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}
138
139void
140gen6_gs_visitor::gs_emit_vertex(int stream_id)
141{
142   this->current_annotation = "gen6 emit vertex";
143
144   /* Buffer all output slots for this vertex in vertex_output */
145   for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
146      int varying = prog_data->vue_map.slot_to_varying[slot];
147      if (varying != VARYING_SLOT_PSIZ) {
148         dst_reg dst(this->vertex_output);
149         dst.reladdr = ralloc(mem_ctx, src_reg);
150         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
151         emit_urb_slot(dst, varying);
152      } else {
153         /* The PSIZ slot can pack multiple varyings in different channels
154          * and emit_urb_slot() will produce a MOV instruction for each of
155          * them. Since we are writing to an array, that will translate to
156          * possibly multiple MOV instructions with an array destination and
157          * each will generate a scratch write with the same offset into
158          * scratch space (thus, each one overwriting the previous). This is
159          * not what we want. What we will do instead is emit PSIZ to a
160          * a regular temporary register, then move that resgister into the
161          * array. This way we only have one instruction with an array
162          * destination and we only produce a single scratch write.
163          */
164         dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
165         emit_urb_slot(tmp, varying);
166         dst_reg dst(this->vertex_output);
167         dst.reladdr = ralloc(mem_ctx, src_reg);
168         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
169         vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
170         inst->force_writemask_all = true;
171      }
172
173      emit(ADD(dst_reg(this->vertex_output_offset),
174               this->vertex_output_offset, brw_imm_ud(1u)));
175   }
176
177   /* Now buffer flags for this vertex */
178   dst_reg dst(this->vertex_output);
179   dst.reladdr = ralloc(mem_ctx, src_reg);
180   memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
181   if (nir->info->gs.output_primitive == GL_POINTS) {
182      /* If we are outputting points, then every vertex has PrimStart and
183       * PrimEnd set.
184       */
185      emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
186                              URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
187      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
188   } else {
189      /* Otherwise, we can only set the PrimStart flag, which we have stored
190       * in the first_vertex register. We will have to wait until we execute
191       * EndPrimitive() or we end the thread to set the PrimEnd flag on a
192       * vertex.
193       */
194      emit(OR(dst, this->first_vertex,
195              brw_imm_ud(gs_prog_data->output_topology <<
196                         URB_WRITE_PRIM_TYPE_SHIFT)));
197      emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
198   }
199   emit(ADD(dst_reg(this->vertex_output_offset),
200            this->vertex_output_offset, brw_imm_ud(1u)));
201}
202
203void
204gen6_gs_visitor::gs_end_primitive()
205{
206   this->current_annotation = "gen6 end primitive";
207   /* Calling EndPrimitive() is optional for point output. In this case we set
208    * the PrimEnd flag when we process EmitVertex().
209    */
210   if (nir->info->gs.output_primitive == GL_POINTS)
211      return;
212
213   /* Otherwise we know that the last vertex we have processed was the last
214    * vertex in the primitive and we need to set its PrimEnd flag, so do this
215    * unless we haven't emitted that vertex at all (vertex_count != 0).
216    *
217    * Notice that we have already incremented vertex_count when we processed
218    * the last emit_vertex, so we need to take that into account in the
219    * comparison below (hence the num_output_vertices + 1 in the comparison
220    * below).
221    */
222   unsigned num_output_vertices = nir->info->gs.vertices_out;
223   emit(CMP(dst_null_ud(), this->vertex_count,
224            brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
225   vec4_instruction *inst = emit(CMP(dst_null_ud(),
226                                     this->vertex_count, brw_imm_ud(0u),
227                                     BRW_CONDITIONAL_NEQ));
228   inst->predicate = BRW_PREDICATE_NORMAL;
229   emit(IF(BRW_PREDICATE_NORMAL));
230   {
231      /* vertex_output_offset is already pointing at the first entry of the
232       * next vertex. So subtract 1 to modify the flags for the previous
233       * vertex.
234       */
235      src_reg offset(this, glsl_type::uint_type);
236      emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
237
238      src_reg dst(this->vertex_output);
239      dst.reladdr = ralloc(mem_ctx, src_reg);
240      memcpy(dst.reladdr, &offset, sizeof(src_reg));
241
242      emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
243      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
244
245      /* Set the first vertex flag to indicate that the next vertex will start
246       * a primitive.
247       */
248      emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
249   }
250   emit(BRW_OPCODE_ENDIF);
251}
252
253void
254gen6_gs_visitor::emit_urb_write_header(int mrf)
255{
256   this->current_annotation = "gen6 urb header";
257   /* Compute offset of the flags for the current vertex in vertex_output and
258    * write them in dw2 of the message header.
259    *
260    * Notice that by the time that emit_thread_end() calls here
261    * vertex_output_offset should point to the first data item of the current
262    * vertex in vertex_output, thus we only need to add the number of output
263    * slots per vertex to that offset to obtain the flags data offset.
264    */
265   src_reg flags_offset(this, glsl_type::uint_type);
266   emit(ADD(dst_reg(flags_offset),
267            this->vertex_output_offset,
268            brw_imm_d(prog_data->vue_map.num_slots)));
269
270   src_reg flags_data(this->vertex_output);
271   flags_data.reladdr = ralloc(mem_ctx, src_reg);
272   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
273
274   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
275}
276
/**
 * Round a URB write message length up to the next odd value.
 *
 * The URB data written (which does not include the message header register)
 * must be a multiple of 256 bits, or 2 VS registers; see vol5c.5, section
 * 5.4.3.2.2: URB_INTERLEAVED. With the header register counted in, a valid
 * length is therefore odd.
 *
 * \param mlen  message length, header register included.
 * \return mlen when it is already odd, mlen + 1 otherwise.
 */
static int
align_interleaved_urb_mlen(int mlen)
{
   const bool already_aligned = (mlen % 2) == 1;

   return already_aligned ? mlen : mlen + 1;
}
288
289void
290gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
291                                       int last_mrf, int urb_offset)
292{
293   vec4_instruction *inst = NULL;
294
295   if (!complete) {
296      /* If the vertex is not complete we don't have to do anything special */
297      inst = emit(GS_OPCODE_URB_WRITE);
298      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
299   } else {
300      /* Otherwise we always request to allocate a new VUE handle. If this is
301       * the last write before the EOT message and the new handle never gets
302       * used it will be dereferenced when we send the EOT message. This is
303       * necessary to avoid different setups for the EOT message (one for the
304       * case when there is no output and another for the case when there is)
305       * which would require to end the program with an IF/ELSE/ENDIF block,
306       * something we do not want.
307       */
308      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
309      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
310      inst->dst = dst_reg(MRF, base_mrf);
311      inst->src[0] = this->temp;
312   }
313
314   inst->base_mrf = base_mrf;
315   inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
316   inst->offset = urb_offset;
317}
318
/**
 * Emit the code that runs when the thread ends.
 *
 * For non-point output it first closes a primitive that is still open. Then,
 * if at least one vertex was emitted, it sends FF_SYNC, loops over the
 * buffered vertices writing each one to the URB (split into several URB
 * writes when the slots do not fit in one message) and, when the program has
 * transform feedback varyings, calls xfb_write(). Finally it emits the EOT
 * message.
 */
void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is zero (first_vertex is cleared after a vertex is emitted
    * and set back to URB_WRITE_PRIM_START when a primitive is ended). This
    * is only relevant for outputs other than points because in the point
    * case we set PrimEnd on all vertices.
    */
   if (nir->info->gs.output_primitive != GL_POINTS) {
      emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      gs_end_primitive();
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 21..23
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* Issue the FF_SYNC message and obtain the initial VUE handle. All of
    * this is only done when at least one vertex was emitted.
    */
   emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (prog->info.has_transform_feedback_varyings) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages. Both
       * the vertex counter and vertex_output_offset restart from zero.
       */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
      emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         /* Leave the loop once every emitted vertex has been written. */
         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion. Each
          * pass of the do/while below fills one URB write message; slot
          * keeps its value across passes so the next message continues
          * where the previous one stopped.
          */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is half
             * of one of those, since we're doing interleaved writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register. Note
                * that this inst is a new local that shadows the one declared
                * at the top of the enclosing IF block.
                */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying][0].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, brw_imm_ud(1u)));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE. Same if we reached the max. message length.
                * The break skips the for loop's own increment, hence the
                * explicit slot++ before leaving.
                */
               if (mrf > max_usable_mrf ||
                   align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
                  slot++;
                  break;
               }
            }

            /* The vertex is complete once all of its slots were copied. */
            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, brw_imm_ud(1u)));

         emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
      }
      emit(BRW_OPCODE_WHILE);

      if (prog->info.has_transform_feedback_varyings)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we have
    * emitted at least one vertex or not. In case we did, the EOT message must
    * always include the COMPLETE flag or else the GPU hangs. If we have not
    * produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so what we do is that we always request a new
    * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
    * With this we make sure that whether we have emitted at least one vertex
    * or none at all, we have to finish the thread without writing to the URB,
    * which works for both cases by setting the COMPLETE and UNUSED flags in
    * the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (prog->info.has_transform_feedback_varyings) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value: the low
       * 16 bits of sol_prim_written, shifted into the upper half of dword 2
       * of the header.
       */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
      emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   /* The EOT message consists of the header register only. */
   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
484
485void
486gen6_gs_visitor::setup_payload()
487{
488   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
489
490   /* Attributes are going to be interleaved, so one register contains two
491    * attribute slots.
492    */
493   int attributes_per_reg = 2;
494
495   /* If a geometry shader tries to read from an input that wasn't written by
496    * the vertex shader, that produces undefined results, but it shouldn't
497    * crash anything.  So initialize attribute_map to zeros--that ensures that
498    * these undefined results are read from r0.
499    */
500   memset(attribute_map, 0, sizeof(attribute_map));
501
502   int reg = 0;
503
504   /* The payload always contains important data in r0. */
505   reg++;
506
507   /* r1 is always part of the payload and it holds information relevant
508    * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
509    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
510    * information (and move the original value to a virtual register if
511    * necessary).
512    */
513   if (gs_prog_data->include_primitive_id)
514      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
515   reg++;
516
517   reg = setup_uniforms(reg);
518
519   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
520
521   lower_attributes_to_hw_regs(attribute_map, true);
522
523   this->first_non_payload_grf = reg;
524}
525
526void
527gen6_gs_visitor::xfb_setup()
528{
529   static const unsigned swizzle_for_offset[4] = {
530      BRW_SWIZZLE4(0, 1, 2, 3),
531      BRW_SWIZZLE4(1, 2, 3, 3),
532      BRW_SWIZZLE4(2, 3, 3, 3),
533      BRW_SWIZZLE4(3, 3, 3, 3)
534   };
535
536   const struct gl_transform_feedback_info *linked_xfb_info =
537      this->prog->sh.LinkedTransformFeedback;
538   int i;
539
540   /* Make sure that the VUE slots won't overflow the unsigned chars in
541    * prog_data->transform_feedback_bindings[].
542    */
543   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
544
545   /* Make sure that we don't need more binding table entries than we've
546    * set aside for use in transform feedback.  (We shouldn't, since we
547    * set aside enough binding table entries to have one per component).
548    */
549   assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
550
551   gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
552   for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
553      gs_prog_data->transform_feedback_bindings[i] =
554         linked_xfb_info->Outputs[i].OutputRegister;
555      gs_prog_data->transform_feedback_swizzles[i] =
556         swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
557   }
558}
559
560void
561gen6_gs_visitor::xfb_write()
562{
563   unsigned num_verts;
564
565   if (!gs_prog_data->num_transform_feedback_bindings)
566      return;
567
568   switch (gs_prog_data->output_topology) {
569   case _3DPRIM_POINTLIST:
570      num_verts = 1;
571      break;
572   case _3DPRIM_LINELIST:
573   case _3DPRIM_LINESTRIP:
574   case _3DPRIM_LINELOOP:
575      num_verts = 2;
576      break;
577   case _3DPRIM_TRILIST:
578   case _3DPRIM_TRIFAN:
579   case _3DPRIM_TRISTRIP:
580   case _3DPRIM_RECTLIST:
581      num_verts = 3;
582      break;
583   case _3DPRIM_QUADLIST:
584   case _3DPRIM_QUADSTRIP:
585   case _3DPRIM_POLYGON:
586      num_verts = 3;
587      break;
588   default:
589      unreachable("Unexpected primitive type in Gen6 SOL program.");
590   }
591
592   this->current_annotation = "gen6 thread end: svb writes init";
593
594   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
595   emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
596
597   /* Check that at least one primitive can be written
598    *
599    * Note: since we use the binding table to keep track of buffer offsets
600    * and stride, the GS doesn't need to keep track of a separate pointer
601    * into each buffer; it uses a single pointer which increments by 1 for
602    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
603    * transform feedback is in interleaved or separate attribs mode.
604    */
605   src_reg sol_temp(this, glsl_type::uvec4_type);
606   emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
607
608   /* Compare SVBI calculated number with the maximum value, which is
609    * in R1.4 (previously saved in this->max_svbi) for gen6.
610    */
611   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
612   emit(IF(BRW_PREDICATE_NORMAL));
613   {
614      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
615                                        brw_imm_vf4(brw_float_to_vf(0.0),
616                                                    brw_float_to_vf(1.0),
617                                                    brw_float_to_vf(2.0),
618                                                    brw_float_to_vf(0.0))));
619      inst->force_writemask_all = true;
620
621      emit(ADD(dst_reg(this->destination_indices),
622               this->destination_indices,
623               this->svbi));
624   }
625   emit(BRW_OPCODE_ENDIF);
626
627   /* Write transform feedback data for all processed vertices. */
628   for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) {
629      emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
630      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
631               BRW_CONDITIONAL_L));
632      emit(IF(BRW_PREDICATE_NORMAL));
633      {
634         xfb_program(i, num_verts);
635      }
636      emit(BRW_OPCODE_ENDIF);
637   }
638}
639
640void
641gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
642{
643   unsigned binding;
644   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
645   src_reg sol_temp(this, glsl_type::uvec4_type);
646
647   /* Check for buffer overflow: we need room to write the complete primitive
648    * (all vertices). Otherwise, avoid writing any vertices for it
649    */
650   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
651   emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
652   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
653   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
654   emit(IF(BRW_PREDICATE_NORMAL));
655   {
656      /* Avoid overwriting MRF 1 as it is used as URB write message header */
657      dst_reg mrf_reg(MRF, 2);
658
659      this->current_annotation = "gen6: emit SOL vertex data";
660      /* For each vertex, generate code to output each varying using the
661       * appropriate binding table entry.
662       */
663      for (binding = 0; binding < num_bindings; ++binding) {
664         unsigned char varying =
665            gs_prog_data->transform_feedback_bindings[binding];
666
667         /* Set up the correct destination index for this vertex */
668         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
669                                       mrf_reg,
670                                       this->destination_indices);
671         inst->sol_vertex = vertex % num_verts;
672
673         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
674          *
675          *   "Prior to End of Thread with a URB_WRITE, the kernel must
676          *   ensure that all writes are complete by sending the final
677          *   write as a committed write."
678          */
679         bool final_write = binding == (unsigned) num_bindings - 1 &&
680                            inst->sol_vertex == num_verts - 1;
681
682         /* Compute offset of this varying for the current vertex
683          * in vertex_output
684          */
685         this->current_annotation = output_reg_annotation[varying];
686         src_reg data(this->vertex_output);
687         data.reladdr = ralloc(mem_ctx, src_reg);
688         int offset = get_vertex_output_offset_for_varying(vertex, varying);
689         emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
690         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
691         data.type = output_reg[varying][0].type;
692
693         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
694          * same slot, so make sure we write the appropriate channel
695          */
696         if (varying == VARYING_SLOT_PSIZ)
697            data.swizzle = BRW_SWIZZLE_WWWW;
698         else if (varying == VARYING_SLOT_LAYER)
699            data.swizzle = BRW_SWIZZLE_YYYY;
700         else if (varying == VARYING_SLOT_VIEWPORT)
701            data.swizzle = BRW_SWIZZLE_ZZZZ;
702         else
703            data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
704
705         /* Write data */
706         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
707         inst->sol_binding = binding;
708         inst->sol_final_write = final_write;
709
710         if (final_write) {
711            /* This is the last vertex of the primitive, then increment
712             * SO num primitive counter and destination indices.
713             */
714            emit(ADD(dst_reg(this->destination_indices),
715                     this->destination_indices,
716                     brw_imm_ud(num_verts)));
717            emit(ADD(dst_reg(this->sol_prim_written),
718                     this->sol_prim_written, brw_imm_ud(1u)));
719         }
720
721      }
722      this->current_annotation = NULL;
723   }
724   emit(BRW_OPCODE_ENDIF);
725}
726
727int
728gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
729{
730   /* Find the output slot assigned to this varying.
731    *
732    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
733    * as VARYING_SLOT_PSIZ.
734    */
735   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
736      varying = VARYING_SLOT_PSIZ;
737   int slot = prog_data->vue_map.varying_to_slot[varying];
738
739   if (slot < 0) {
740      /* This varying does not exist in the VUE so we are not writing to it
741       * and its value is undefined. We still want to return a valid offset
742       * into vertex_output though, to prevent any out-of-bound accesses into
743       * the vertex_output array. Since the value for this varying is undefined
744       * we don't really care for the value we assign to it, so any offset
745       * within the limits of vertex_output will do.
746       */
747      slot = 0;
748   }
749
750   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
751}
752
753} /* namespace brw */
754