1/*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24/**
25 * An implementation of the transform feedback driver hooks for Haswell
26 * and later hardware.  This uses MI_MATH to compute the number of vertices
27 * written (for use by DrawTransformFeedback()) without any CPU<->GPU
28 * synchronization which could stall.
29 */
30
31#include "brw_context.h"
32#include "brw_state.h"
33#include "brw_defines.h"
34#include "intel_batchbuffer.h"
35#include "intel_buffer_objects.h"
36#include "main/transformfeedback.h"
37
38/**
39 * We store several values in obj->prim_count_bo:
40 *
41 * [4x 32-bit values]: Final Number of Vertices Written
42 * [4x 32-bit values]: Tally of Primitives Written So Far
43 * [4x 64-bit values]: Starting SO_NUM_PRIMS_WRITTEN Counter Snapshots
44 *
45 * The first set of values is used by DrawTransformFeedback(), which
46 * copies one of them into the 3DPRIM_VERTEX_COUNT register and performs
47 * an indirect draw.  The other values are just temporary storage.
48 */
49
/* Byte offset of the per-stream 32-bit primitive tallies.  They follow the
 * BRW_MAX_XFB_STREAMS 32-bit "final vertex count" values stored at offset 0.
 */
#define TALLY_OFFSET (BRW_MAX_XFB_STREAMS * sizeof(uint32_t))
/* Byte offset of the per-stream 64-bit start snapshots.  They follow the
 * final-count and tally regions, each of which is TALLY_OFFSET bytes long.
 */
#define START_OFFSET (TALLY_OFFSET * 2)
52
53/**
54 * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t values)
55 * to prim_count_bo.
56 */
57static void
58save_prim_start_values(struct brw_context *brw,
59                       struct brw_transform_feedback_object *obj)
60{
61   /* Flush any drawing so that the counters have the right values. */
62   brw_emit_mi_flush(brw);
63
64   /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
65   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
66      brw_store_register_mem64(brw, obj->prim_count_bo,
67                               GEN7_SO_NUM_PRIMS_WRITTEN(i),
68                               START_OFFSET + i * sizeof(uint64_t));
69   }
70}
71
72/**
73 * Compute the number of primitives written during our most recent
74 * transform feedback activity (the current SO_NUM_PRIMS_WRITTEN value
75 * minus the stashed "start" value), and add it to our running tally.
76 *
77 * If \p finalize is true, also compute the number of vertices written
78 * (by multiplying by the number of vertices per primitive), and store
79 * that to the "final" location.
80 *
81 * Otherwise, just overwrite the old tally with the new one.
82 */
83static void
84tally_prims_written(struct brw_context *brw,
85                    struct brw_transform_feedback_object *obj,
86                    bool finalize)
87{
88   /* Flush any drawing so that the counters have the right values. */
89   brw_emit_mi_flush(brw);
90
91   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
92      /* GPR0 = Tally */
93      brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
94      brw_load_register_mem(brw, HSW_CS_GPR(0), obj->prim_count_bo,
95                            I915_GEM_DOMAIN_INSTRUCTION,
96                            I915_GEM_DOMAIN_INSTRUCTION,
97                            TALLY_OFFSET + i * sizeof(uint32_t));
98      if (!obj->base.Paused) {
99         /* GPR1 = Start Snapshot */
100         brw_load_register_mem64(brw, HSW_CS_GPR(1), obj->prim_count_bo,
101                                 I915_GEM_DOMAIN_INSTRUCTION,
102                                 I915_GEM_DOMAIN_INSTRUCTION,
103                                 START_OFFSET + i * sizeof(uint64_t));
104         /* GPR2 = Ending Snapshot */
105         brw_load_register_reg64(brw, GEN7_SO_NUM_PRIMS_WRITTEN(i), HSW_CS_GPR(2));
106
107         BEGIN_BATCH(9);
108         OUT_BATCH(HSW_MI_MATH | (9 - 2));
109         /* GPR1 = GPR2 (End) - GPR1 (Start) */
110         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
111         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
112         OUT_BATCH(MI_MATH_ALU0(SUB));
113         OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
114         /* GPR0 = GPR0 (Tally) + GPR1 (Diff) */
115         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
116         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
117            OUT_BATCH(MI_MATH_ALU0(ADD));
118         OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
119         ADVANCE_BATCH();
120      }
121
122      if (!finalize) {
123         /* Write back the new tally */
124         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
125                                  TALLY_OFFSET + i * sizeof(uint32_t));
126      } else {
127         /* Convert the number of primitives to the number of vertices. */
128         if (obj->primitive_mode == GL_LINES) {
129            /* Double R0 (R0 = R0 + R0) */
130            BEGIN_BATCH(5);
131            OUT_BATCH(HSW_MI_MATH | (5 - 2));
132            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
133            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
134            OUT_BATCH(MI_MATH_ALU0(ADD));
135            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
136            ADVANCE_BATCH();
137         } else if (obj->primitive_mode == GL_TRIANGLES) {
138            /* Triple R0 (R1 = R0 + R0, R0 = R0 + R1) */
139            BEGIN_BATCH(9);
140            OUT_BATCH(HSW_MI_MATH | (9 - 2));
141            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
142            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
143            OUT_BATCH(MI_MATH_ALU0(ADD));
144            OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
145            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
146            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
147            OUT_BATCH(MI_MATH_ALU0(ADD));
148            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
149            ADVANCE_BATCH();
150         }
151         /* Store it to the final result */
152         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
153                                  i * sizeof(uint32_t));
154      }
155   }
156}
157
158/**
159 * BeginTransformFeedback() driver hook.
160 */
161void
162hsw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
163                              struct gl_transform_feedback_object *obj)
164{
165   struct brw_context *brw = brw_context(ctx);
166   struct brw_transform_feedback_object *brw_obj =
167      (struct brw_transform_feedback_object *) obj;
168
169   brw_obj->primitive_mode = mode;
170
171   /* Reset the SO buffer offsets to 0. */
172   if (brw->gen >= 8) {
173      brw_obj->zero_offsets = true;
174   } else {
175      BEGIN_BATCH(1 + 2 * BRW_MAX_XFB_STREAMS);
176      OUT_BATCH(MI_LOAD_REGISTER_IMM | (1 + 2 * BRW_MAX_XFB_STREAMS - 2));
177      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
178         OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
179         OUT_BATCH(0);
180      }
181      ADVANCE_BATCH();
182   }
183
184   /* Zero out the initial tallies */
185   brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET,     0ull);
186   brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET + 8, 0ull);
187
188   /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
189   save_prim_start_values(brw, brw_obj);
190}
191
192/**
193 * PauseTransformFeedback() driver hook.
194 */
195void
196hsw_pause_transform_feedback(struct gl_context *ctx,
197                              struct gl_transform_feedback_object *obj)
198{
199   struct brw_context *brw = brw_context(ctx);
200   struct brw_transform_feedback_object *brw_obj =
201      (struct brw_transform_feedback_object *) obj;
202
203   if (brw->is_haswell) {
204      /* Flush any drawing so that the counters have the right values. */
205      brw_emit_mi_flush(brw);
206
207      /* Save the SOL buffer offset register values. */
208      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
209         BEGIN_BATCH(3);
210         OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
211         OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
212         OUT_RELOC(brw_obj->offset_bo,
213                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
214                   i * sizeof(uint32_t));
215         ADVANCE_BATCH();
216      }
217   }
218
219   /* Add any primitives written to our tally */
220   tally_prims_written(brw, brw_obj, false);
221}
222
223/**
224 * ResumeTransformFeedback() driver hook.
225 */
226void
227hsw_resume_transform_feedback(struct gl_context *ctx,
228                               struct gl_transform_feedback_object *obj)
229{
230   struct brw_context *brw = brw_context(ctx);
231   struct brw_transform_feedback_object *brw_obj =
232      (struct brw_transform_feedback_object *) obj;
233
234   if (brw->is_haswell) {
235      /* Reload the SOL buffer offset registers. */
236      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
237         BEGIN_BATCH(3);
238         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
239         OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
240         OUT_RELOC(brw_obj->offset_bo,
241                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
242                   i * sizeof(uint32_t));
243         ADVANCE_BATCH();
244      }
245   }
246
247   /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
248   save_prim_start_values(brw, brw_obj);
249}
250
/**
 * Driver hook for EndTransformFeedback().
 *
 * \param ctx  GL context
 * \param obj  transform feedback object being ended
 */
void
hsw_end_transform_feedback(struct gl_context *ctx,
                           struct gl_transform_feedback_object *obj)
{
   /* Finalizing adds any outstanding primitives to the tally, converts the
    * primitive count into a vertex count, and stores it in the "final"
    * slot of the buffer, which DrawTransformFeedback() uses as the vertex
    * count.
    */
   tally_prims_written(brw_context(ctx),
                       (struct brw_transform_feedback_object *) obj,
                       true);
}
269