fd4_query.c revision e6bfe1c7734cfbf41a763797527db6cb49fa1566
1/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3/*
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 *    Rob Clark <robclark@freedesktop.org>
27 */
28
29#include "freedreno_query_hw.h"
30#include "freedreno_context.h"
31#include "freedreno_util.h"
32
33#include "fd4_query.h"
34#include "fd4_context.h"
35#include "fd4_draw.h"
36#include "fd4_format.h"
37
38
/* Number of 64-bit counter slots in one occlusion sample. */
enum { FD4_RB_SAMP_CTR_COUNT = 16 };

/*
 * Storage for one occlusion sample.  This is the size requested from
 * fd_hw_sample_init() by occlusion_get_sample(); only ctr[0] is consumed
 * in this file (see count_samples()).
 *
 * NOTE(review): presumably mirrors the layout the hardware writes when
 * the sample counters are copied out -- confirm against the hw docs.
 */
struct fd_rb_samp_ctrs {
	uint64_t ctr[FD4_RB_SAMP_CTR_COUNT];
};
42
43/*
44 * Occlusion Query:
45 *
46 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
47 * interpret results
48 */
49
50static struct fd_hw_sample *
51occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
52{
53	struct fd_hw_sample *samp =
54			fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));
55
56	/* low bits of sample addr should be zero (since they are control
57	 * flags in RB_SAMPLE_COUNT_CONTROL):
58	 */
59	debug_assert((samp->offset & 0x3) == 0);
60
61	/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
62	 * HW_QUERY_BASE_REG register:
63	 */
64	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
65	OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
66	OUT_RING(ring, HW_QUERY_BASE_REG);
67	OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY |
68			samp->offset);
69
70	OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
71	OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
72						INDEX4_SIZE_32_BIT, USE_VISIBILITY));
73	OUT_RING(ring, 1);             /* NumInstances */
74	OUT_RING(ring, 0);             /* NumIndices */
75
76	fd_event_write(batch, ring, ZPASS_DONE);
77
78	return samp;
79}
80
81static uint64_t
82count_samples(const struct fd_rb_samp_ctrs *start,
83		const struct fd_rb_samp_ctrs *end)
84{
85	return end->ctr[0] - start->ctr[0];
86}
87
88static void
89occlusion_counter_accumulate_result(struct fd_context *ctx,
90		const void *start, const void *end,
91		union pipe_query_result *result)
92{
93	uint64_t n = count_samples(start, end);
94	result->u64 += n;
95}
96
97static void
98occlusion_predicate_accumulate_result(struct fd_context *ctx,
99		const void *start, const void *end,
100		union pipe_query_result *result)
101{
102	uint64_t n = count_samples(start, end);
103	result->b |= (n > 0);
104}
105
/*
 * Time Elapsed Query:
 *
 * Note: timestamp queries are supported too (see the timestamp provider
 * below), but they won't give sensible results for tilers.
 */
112
113static void
114time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
115{
116	/* Right now, the assignment of countable to counter register is
117	 * just hard coded.  If we start exposing more countables than we
118	 * have counters, we will need to be more clever.
119	 */
120	fd_wfi(ctx->batch, ring);
121	OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
122	OUT_RING(ring, CP_ALWAYS_COUNT);
123}
124
125static struct fd_hw_sample *
126time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
127{
128	struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));
129
130	/* use unused part of vsc_size_mem as scratch space, to avoid
131	 * extra allocation:
132	 */
133	struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
134	const int sample_off = 128;
135	const int addr_off = sample_off + 8;
136
137	debug_assert(batch->ctx->screen->max_freq > 0);
138
139	/* Basic issue is that we need to read counter value to a relative
140	 * destination (with per-tile offset) rather than absolute dest
141	 * addr.  But there is no pm4 packet that can do that.  This is
142	 * where it would be *really* nice if we could write our own fw
143	 * since afaict implementing the sort of packet we need would be
144	 * trivial.
145	 *
146	 * Instead, we:
147	 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
148	 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
149	 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
150	 *     address to the per-sample offset in the scratch buffer
151	 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
152	 *     to CP_ME_NRT_ADDR
153	 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
154	 *     buffer to CP_ME_NRT_DATA to trigger the write out to query
155	 *     result buffer
156	 *
157	 * Straightforward, right?
158	 *
159	 * Maybe could swap the order of things in the scratch buffer to
160	 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
161	 * shot, but that's really just polishing a turd..
162	 */
163
164	fd_wfi(batch, ring);
165
166	/* copy sample counter _LO and _HI to scratch: */
167	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
168	OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
169			CP_REG_TO_MEM_0_64B |
170			CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */
171	OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0);
172
173	/* ok... here we really *would* like to use the CP_SET_CONSTANT
174	 * mode which can add a constant to value in reg2 and write to
175	 * reg1... *but* that only works for banked/context registers,
176	 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
177	 * CP math to the scratch buffer instead:
178	 *
179	 * (note first 8 bytes are counter value, use offset 0x8 for
180	 * address calculation)
181	 */
182
183	/* per-sample offset to scratch bo: */
184	OUT_PKT3(ring, CP_MEM_WRITE, 2);
185	OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);
186	OUT_RING(ring, samp->offset);
187
188	/* now add to that the per-tile base: */
189	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
190	OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
191			CP_REG_TO_MEM_0_ACCUMULATE |
192			CP_REG_TO_MEM_0_CNT(1-1));       /* readback 1 regs */
193	OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);
194
195	/* now copy that back to CP_ME_NRT_ADDR: */
196	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
197	OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
198	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
199
200	/* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
201	 * to trigger the write to result buffer
202	 */
203	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
204	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
205	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);
206
207	/* and again to get the value of the _HI reg from scratch: */
208	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
209	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
210	OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);
211
212	/* Sigh.. */
213
214	return samp;
215}
216
217static void
218time_elapsed_accumulate_result(struct fd_context *ctx,
219		const void *start, const void *end,
220		union pipe_query_result *result)
221{
222	uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
223	/* max_freq is in Hz, convert cycle count to ns: */
224	result->u64 += n * 1000000000 / ctx->screen->max_freq;
225}
226
227static void
228timestamp_accumulate_result(struct fd_context *ctx,
229		const void *start, const void *end,
230		union pipe_query_result *result)
231{
232	/* just return the value from fist tile: */
233	if (result->u64 != 0)
234		return;
235	uint64_t n = *(uint64_t *)start;
236	/* max_freq is in Hz, convert cycle count to ns: */
237	result->u64 = n * 1000000000 / ctx->screen->max_freq;
238}
239
240static const struct fd_hw_sample_provider occlusion_counter = {
241		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
242		.active = FD_STAGE_DRAW,
243		.get_sample = occlusion_get_sample,
244		.accumulate_result = occlusion_counter_accumulate_result,
245};
246
247static const struct fd_hw_sample_provider occlusion_predicate = {
248		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
249		.active = FD_STAGE_DRAW,
250		.get_sample = occlusion_get_sample,
251		.accumulate_result = occlusion_predicate_accumulate_result,
252};
253
254static const struct fd_hw_sample_provider time_elapsed = {
255		.query_type = PIPE_QUERY_TIME_ELAPSED,
256		.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
257		.enable = time_elapsed_enable,
258		.get_sample = time_elapsed_get_sample,
259		.accumulate_result = time_elapsed_accumulate_result,
260};
261
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more nonsensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
268static const struct fd_hw_sample_provider timestamp = {
269		.query_type = PIPE_QUERY_TIMESTAMP,
270		.active = FD_STAGE_ALL,
271		.enable = time_elapsed_enable,
272		.get_sample = time_elapsed_get_sample,
273		.accumulate_result = timestamp_accumulate_result,
274};
275
276void fd4_query_context_init(struct pipe_context *pctx)
277{
278	fd_hw_query_register_provider(pctx, &occlusion_counter);
279	fd_hw_query_register_provider(pctx, &occlusion_predicate);
280	fd_hw_query_register_provider(pctx, &time_elapsed);
281	fd_hw_query_register_provider(pctx, &timestamp);
282}
283