1/*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *  Nicolai Hähnle <nicolai.haehnle@amd.com>
25 *
26 */
27
28#include "radeon/r600_cs.h"
29#include "radeon/r600_query.h"
30#include "util/u_memory.h"
31
32#include "si_pipe.h"
33#include "sid.h"
34
/* Describes how the selector registers of a block are arranged.
 *
 * The low two bits (SI_PC_MULTI_MASK) pick one of four mutually exclusive
 * arrangements; the remaining values are independent flag bits that are
 * OR'ed on top.
 */
enum si_pc_reg_layout {
	/* The secondary selector dwords form one block that directly
	 * follows the primary selector dwords of those counters that have
	 * secondary selectors.
	 */
	SI_PC_MULTI_BLOCK = 0,

	/* Every secondary selector dword immediately follows its
	 * corresponding primary selector dword.
	 */
	SI_PC_MULTI_ALTERNATE = 1,

	/* The secondary selector dwords form one block that follows all
	 * primary selector dwords.
	 */
	SI_PC_MULTI_TAIL = 2,

	/* Selector registers are arranged free-form. */
	SI_PC_MULTI_CUSTOM = 3,

	/* Mask extracting the arrangement from a layout value. */
	SI_PC_MULTI_MASK = 0x3,

	/* Flag: registers are laid out in decreasing rather than increasing
	 * order.
	 */
	SI_PC_REG_REVERSE = 1 << 2,

	/* Flag: the block is fake. */
	SI_PC_FAKE = 1 << 3,
};
61
/* Static, chip-independent description of one performance counter block. */
struct si_pc_block_base {
	/* Block name, as handed to r600_perfcounters_add_block. */
	const char *name;
	/* Upper bound on the number of counters selected at once. */
	unsigned num_counters;
	/* R600_PC_BLOCK_* flags. */
	unsigned flags;

	/* Bits OR'ed into every selector value that gets emitted. */
	unsigned select_or;
	/* First selector register for the sequential layouts. */
	unsigned select0;
	/* Register from which counter reads start. */
	unsigned counter0_lo;
	/* Explicit selector register list (SI_PC_MULTI_CUSTOM only). */
	unsigned *select;
	/* Optional explicit list of counter registers; when set it
	 * overrides the stepping from counter0_lo.
	 */
	unsigned *counters;
	/* Number of leading counters that get an extra (zeroed) secondary
	 * selector dword.
	 */
	unsigned num_multi;
	/* Number of zero dwords written ahead of the selectors. */
	unsigned num_prelude;
	/* Combination of enum si_pc_reg_layout values. */
	unsigned layout;
};
76
/* Per-chip-class instantiation of a block description.
 *
 * The member order matters: the group tables initialize this struct
 * positionally.
 */
struct si_pc_block {
	/* Shared register description. */
	struct si_pc_block_base *b;
	/* Selector count reported for the block. */
	unsigned selectors;
	/* Instance count reported for the block (0 when not listed). */
	unsigned instances;
};
82
/* Name suffixes of the shader types. The order is deliberately compatible
 * with the performance counter group IDs hardcoded in GPUPerfStudio, and
 * entries correspond index-by-index to si_pc_shader_type_bits.
 */
static const char * const si_pc_shader_type_suffixes[] = {
	[0] = "",
	[1] = "_ES",
	[2] = "_GS",
	[3] = "_VS",
	[4] = "_PS",
	[5] = "_LS",
	[6] = "_HS",
	[7] = "_CS",
};
89
90static const unsigned si_pc_shader_type_bits[] = {
91	0x7f,
92	S_036780_ES_EN(1),
93	S_036780_GS_EN(1),
94	S_036780_VS_EN(1),
95	S_036780_PS_EN(1),
96	S_036780_LS_EN(1),
97	S_036780_HS_EN(1),
98	S_036780_CS_EN(1),
99};
100
101static struct si_pc_block_base cik_CB = {
102	.name = "CB",
103	.num_counters = 4,
104	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,
105
106	.select0 = R_037000_CB_PERFCOUNTER_FILTER,
107	.counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
108	.num_multi = 1,
109	.num_prelude = 1,
110	.layout = SI_PC_MULTI_ALTERNATE,
111};
112
113static unsigned cik_CPC_select[] = {
114	R_036024_CPC_PERFCOUNTER0_SELECT,
115	R_036010_CPC_PERFCOUNTER0_SELECT1,
116	R_03600C_CPC_PERFCOUNTER1_SELECT,
117};
118static struct si_pc_block_base cik_CPC = {
119	.name = "CPC",
120	.num_counters = 2,
121
122	.select = cik_CPC_select,
123	.counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
124	.num_multi = 1,
125	.layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
126};
127
128static struct si_pc_block_base cik_CPF = {
129	.name = "CPF",
130	.num_counters = 2,
131
132	.select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
133	.counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
134	.num_multi = 1,
135	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
136};
137
138static struct si_pc_block_base cik_CPG = {
139	.name = "CPG",
140	.num_counters = 2,
141
142	.select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
143	.counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
144	.num_multi = 1,
145	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
146};
147
148static struct si_pc_block_base cik_DB = {
149	.name = "DB",
150	.num_counters = 4,
151	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,
152
153	.select0 = R_037100_DB_PERFCOUNTER0_SELECT,
154	.counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
155	.num_multi = 3, // really only 2, but there's a gap between registers
156	.layout = SI_PC_MULTI_ALTERNATE,
157};
158
159static struct si_pc_block_base cik_GDS = {
160	.name = "GDS",
161	.num_counters = 4,
162
163	.select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
164	.counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
165	.num_multi = 1,
166	.layout = SI_PC_MULTI_TAIL,
167};
168
169static unsigned cik_GRBM_counters[] = {
170	R_034100_GRBM_PERFCOUNTER0_LO,
171	R_03410C_GRBM_PERFCOUNTER1_LO,
172};
173static struct si_pc_block_base cik_GRBM = {
174	.name = "GRBM",
175	.num_counters = 2,
176
177	.select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
178	.counters = cik_GRBM_counters,
179};
180
181static struct si_pc_block_base cik_GRBMSE = {
182	.name = "GRBMSE",
183	.num_counters = 4,
184
185	.select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
186	.counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
187};
188
189static struct si_pc_block_base cik_IA = {
190	.name = "IA",
191	.num_counters = 4,
192
193	.select0 = R_036210_IA_PERFCOUNTER0_SELECT,
194	.counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
195	.num_multi = 1,
196	.layout = SI_PC_MULTI_TAIL,
197};
198
199static struct si_pc_block_base cik_PA_SC = {
200	.name = "PA_SC",
201	.num_counters = 8,
202	.flags = R600_PC_BLOCK_SE,
203
204	.select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
205	.counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
206	.num_multi = 1,
207	.layout = SI_PC_MULTI_ALTERNATE,
208};
209
210/* According to docs, PA_SU counters are only 48 bits wide. */
211static struct si_pc_block_base cik_PA_SU = {
212	.name = "PA_SU",
213	.num_counters = 4,
214	.flags = R600_PC_BLOCK_SE,
215
216	.select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
217	.counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
218	.num_multi = 2,
219	.layout = SI_PC_MULTI_ALTERNATE,
220};
221
222static struct si_pc_block_base cik_SPI = {
223	.name = "SPI",
224	.num_counters = 6,
225	.flags = R600_PC_BLOCK_SE,
226
227	.select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
228	.counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
229	.num_multi = 4,
230	.layout = SI_PC_MULTI_BLOCK,
231};
232
233static struct si_pc_block_base cik_SQ = {
234	.name = "SQ",
235	.num_counters = 16,
236	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_SHADER,
237
238	.select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
239	.select_or = S_036700_SQC_BANK_MASK(15) |
240			S_036700_SQC_CLIENT_MASK(15) |
241			S_036700_SIMD_MASK(15),
242	.counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
243};
244
245static struct si_pc_block_base cik_SX = {
246	.name = "SX",
247	.num_counters = 4,
248	.flags = R600_PC_BLOCK_SE,
249
250	.select0 = R_036900_SX_PERFCOUNTER0_SELECT,
251	.counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
252	.num_multi = 2,
253	.layout = SI_PC_MULTI_TAIL,
254};
255
256static struct si_pc_block_base cik_TA = {
257	.name = "TA",
258	.num_counters = 2,
259	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
260
261	.select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
262	.counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
263	.num_multi = 1,
264	.layout = SI_PC_MULTI_ALTERNATE,
265};
266
267static struct si_pc_block_base cik_TD = {
268	.name = "TD",
269	.num_counters = 2,
270	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
271
272	.select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
273	.counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
274	.num_multi = 1,
275	.layout = SI_PC_MULTI_ALTERNATE,
276};
277
278static struct si_pc_block_base cik_TCA = {
279	.name = "TCA",
280	.num_counters = 4,
281	.flags = R600_PC_BLOCK_INSTANCE_GROUPS,
282
283	.select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
284	.counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
285	.num_multi = 2,
286	.layout = SI_PC_MULTI_ALTERNATE,
287};
288
289static struct si_pc_block_base cik_TCC = {
290	.name = "TCC",
291	.num_counters = 4,
292	.flags = R600_PC_BLOCK_INSTANCE_GROUPS,
293
294	.select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
295	.counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
296	.num_multi = 2,
297	.layout = SI_PC_MULTI_ALTERNATE,
298};
299
300static struct si_pc_block_base cik_TCP = {
301	.name = "TCP",
302	.num_counters = 4,
303	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
304
305	.select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
306	.counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
307	.num_multi = 2,
308	.layout = SI_PC_MULTI_ALTERNATE,
309};
310
311static struct si_pc_block_base cik_VGT = {
312	.name = "VGT",
313	.num_counters = 4,
314	.flags = R600_PC_BLOCK_SE,
315
316	.select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
317	.counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
318	.num_multi = 1,
319	.layout = SI_PC_MULTI_TAIL,
320};
321
322static struct si_pc_block_base cik_WD = {
323	.name = "WD",
324	.num_counters = 4,
325
326	.select0 = R_036200_WD_PERFCOUNTER0_SELECT,
327	.counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
328};
329
330static struct si_pc_block_base cik_MC = {
331	.name = "MC",
332	.num_counters = 4,
333
334	.layout = SI_PC_FAKE,
335};
336
337static struct si_pc_block_base cik_SRBM = {
338	.name = "SRBM",
339	.num_counters = 2,
340
341	.layout = SI_PC_FAKE,
342};
343
344/* Both the number of instances and selectors varies between chips of the same
345 * class. We only differentiate by class here and simply expose the maximum
346 * number over all chips in a class.
347 *
348 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
349 * blindly once it believes it has identified the hardware, so the order of
350 * blocks here matters.
351 */
352static struct si_pc_block groups_CIK[] = {
353	{ &cik_CB, 226, 4 },
354	{ &cik_CPF, 17 },
355	{ &cik_DB, 257, 4 },
356	{ &cik_GRBM, 34 },
357	{ &cik_GRBMSE, 15 },
358	{ &cik_PA_SU, 153 },
359	{ &cik_PA_SC, 395 },
360	{ &cik_SPI, 186 },
361	{ &cik_SQ, 252 },
362	{ &cik_SX, 32 },
363	{ &cik_TA, 111, 11 },
364	{ &cik_TCA, 39, 2 },
365	{ &cik_TCC, 160, 16 },
366	{ &cik_TD, 55, 11 },
367	{ &cik_TCP, 154, 11 },
368	{ &cik_GDS, 121 },
369	{ &cik_VGT, 140 },
370	{ &cik_IA, 22 },
371	{ &cik_MC, 22 },
372	{ &cik_SRBM, 19 },
373	{ &cik_WD, 22 },
374	{ &cik_CPG, 46 },
375	{ &cik_CPC, 22 },
376
377};
378
379static struct si_pc_block groups_VI[] = {
380	{ &cik_CB, 396, 4 },
381	{ &cik_CPF, 19 },
382	{ &cik_DB, 257, 4 },
383	{ &cik_GRBM, 34 },
384	{ &cik_GRBMSE, 15 },
385	{ &cik_PA_SU, 153 },
386	{ &cik_PA_SC, 397 },
387	{ &cik_SPI, 197 },
388	{ &cik_SQ, 273 },
389	{ &cik_SX, 34 },
390	{ &cik_TA, 119, 16 },
391	{ &cik_TCA, 35, 2 },
392	{ &cik_TCC, 192, 16 },
393	{ &cik_TD, 55, 16 },
394	{ &cik_TCP, 180, 16 },
395	{ &cik_GDS, 121 },
396	{ &cik_VGT, 147 },
397	{ &cik_IA, 24 },
398	{ &cik_MC, 22 },
399	{ &cik_SRBM, 27 },
400	{ &cik_WD, 37 },
401	{ &cik_CPG, 48 },
402	{ &cik_CPC, 24 },
403
404};
405
406static void si_pc_get_size(struct r600_perfcounter_block *group,
407			unsigned count, unsigned *selectors,
408			unsigned *num_select_dw, unsigned *num_read_dw)
409{
410	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
411	struct si_pc_block_base *regs = sigroup->b;
412	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
413
414	if (regs->layout & SI_PC_FAKE) {
415		*num_select_dw = 0;
416	} else if (layout_multi == SI_PC_MULTI_BLOCK) {
417		if (count < regs->num_multi)
418			*num_select_dw = 2 * (count + 2) + regs->num_prelude;
419		else
420			*num_select_dw = 2 + count + regs->num_multi + regs->num_prelude;
421	} else if (layout_multi == SI_PC_MULTI_TAIL) {
422		*num_select_dw = 4 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
423	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
424		assert(regs->num_prelude == 0);
425		*num_select_dw = 3 * (count + MIN2(count, regs->num_multi));
426	} else {
427		assert(layout_multi == SI_PC_MULTI_ALTERNATE);
428
429		*num_select_dw = 2 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
430	}
431
432	*num_read_dw = 6 * count;
433}
434
435static void si_pc_emit_instance(struct r600_common_context *ctx,
436				int se, int instance)
437{
438	struct radeon_winsys_cs *cs = ctx->gfx.cs;
439	unsigned value = S_030800_SH_BROADCAST_WRITES(1);
440
441	if (se >= 0) {
442		value |= S_030800_SE_INDEX(se);
443	} else {
444		value |= S_030800_SE_BROADCAST_WRITES(1);
445	}
446
447	if (instance >= 0) {
448		value |= S_030800_INSTANCE_INDEX(instance);
449	} else {
450		value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
451	}
452
453	radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
454}
455
456static void si_pc_emit_shaders(struct r600_common_context *ctx,
457			       unsigned shaders)
458{
459	struct radeon_winsys_cs *cs = ctx->gfx.cs;
460
461	radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
462	radeon_emit(cs, shaders & 0x7f);
463	radeon_emit(cs, 0xffffffff);
464}
465
/* Emit the register writes that program the selectors of one block.
 *
 * ctx:       context whose gfx command stream receives the writes
 * group:     block whose `data` points at a struct si_pc_block
 * count:     number of counters to program; must not exceed the block's
 *            num_counters
 * selectors: `count` selector values; each one is OR'ed with the block's
 *            select_or before being emitted
 *
 * Nothing is emitted for fake blocks. For all other blocks the write
 * pattern follows the block's layout (see enum si_pc_reg_layout): the
 * first num_prelude dwords and all secondary selector dwords are written
 * as zero.
 *
 * NOTE(review): the dword counts produced here presumably have to stay in
 * sync with si_pc_get_size.
 */
static void si_pc_emit_select(struct r600_common_context *ctx,
		        struct r600_perfcounter_block *group,
		        unsigned count, unsigned *selectors)
{
	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
	struct si_pc_block_base *regs = sigroup->b;
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	unsigned idx;
	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
	unsigned dw;

	assert(count <= regs->num_counters);

	/* Fake blocks have no selector registers to program. */
	if (regs->layout & SI_PC_FAKE)
		return;

	if (layout_multi == SI_PC_MULTI_BLOCK) {
		assert(!(regs->layout & SI_PC_REG_REVERSE));

		/* A single register sequence covers everything when all
		 * num_multi secondary selector slots are in use; otherwise
		 * the secondary selectors get their own sequence below.
		 */
		dw = count + regs->num_prelude;
		if (count >= regs->num_multi)
			dw += regs->num_multi;
		radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		/* Primary selectors of the counters that have a secondary. */
		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		if (count < regs->num_multi) {
			/* Start a second sequence num_multi registers past
			 * select0 for the zeroed secondary selectors.
			 */
			unsigned select1 =
				regs->select0 + 4 * regs->num_multi;
			radeon_set_uconfig_reg_seq(cs, select1, count);
		}

		/* Secondary selectors, all zero. */
		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
			radeon_emit(cs, 0);

		/* Primary selectors of the remaining counters. */
		if (count > regs->num_multi) {
			for (idx = regs->num_multi; idx < count; ++idx)
				radeon_emit(cs, selectors[idx] | regs->select_or);
		}
	} else if (layout_multi == SI_PC_MULTI_TAIL) {
		unsigned select1, select1_count;

		assert(!(regs->layout & SI_PC_REG_REVERSE));

		/* First sequence: prelude followed by all primary selectors. */
		radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
		for (idx = 0; idx < regs->num_prelude; ++idx)
			radeon_emit(cs, 0);
		for (idx = 0; idx < count; ++idx)
			radeon_emit(cs, selectors[idx] | regs->select_or);

		/* Second sequence: zeroed secondary selectors, starting
		 * num_counters registers past select0.
		 */
		select1 = regs->select0 + 4 * regs->num_counters;
		select1_count = MIN2(count, regs->num_multi);
		radeon_set_uconfig_reg_seq(cs, select1, select1_count);
		for (idx = 0; idx < select1_count; ++idx)
			radeon_emit(cs, 0);
	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
		/* Walk the explicit register list with individual writes:
		 * a primary selector, then a zeroed secondary selector for
		 * the first num_multi counters.
		 */
		unsigned *reg = regs->select;
		for (idx = 0; idx < count; ++idx) {
			radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
			if (idx < regs->num_multi)
				radeon_set_uconfig_reg(cs, *reg++, 0);
		}
	} else {
		assert(layout_multi == SI_PC_MULTI_ALTERNATE);

		unsigned reg_base = regs->select0;
		unsigned reg_count = count + MIN2(count, regs->num_multi);
		reg_count += regs->num_prelude;

		if (!(regs->layout & SI_PC_REG_REVERSE)) {
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
			/* Each primary selector is directly followed by its
			 * zeroed secondary selector, where one exists.
			 */
			for (idx = 0; idx < count; ++idx) {
				radeon_emit(cs, selectors[idx] | regs->select_or);
				if (idx < regs->num_multi)
					radeon_emit(cs, 0);
			}
		} else {
			/* Registers descend from select0, so start the
			 * sequence at the lowest register and emit the
			 * dwords in mirrored order.
			 */
			reg_base -= (reg_count - 1) * 4;
			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

			for (idx = count; idx > 0; --idx) {
				if (idx <= regs->num_multi)
					radeon_emit(cs, 0);
				radeon_emit(cs, selectors[idx - 1] | regs->select_or);
			}
			for (idx = 0; idx < regs->num_prelude; ++idx)
				radeon_emit(cs, 0);
		}
	}
}
561
562static void si_pc_emit_start(struct r600_common_context *ctx,
563			     struct r600_resource *buffer, uint64_t va)
564{
565	struct radeon_winsys_cs *cs = ctx->gfx.cs;
566
567	radeon_add_to_buffer_list(ctx, &ctx->gfx, buffer,
568				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
569
570	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
571	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
572			COPY_DATA_DST_SEL(COPY_DATA_MEM));
573	radeon_emit(cs, 1); /* immediate */
574	radeon_emit(cs, 0); /* unused */
575	radeon_emit(cs, va);
576	radeon_emit(cs, va >> 32);
577
578	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
579			       S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
580	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
581	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_START) | EVENT_INDEX(0));
582	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
583			       S_036020_PERFMON_STATE(V_036020_START_COUNTING));
584}
585
586/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
587 * do it again in here. */
588static void si_pc_emit_stop(struct r600_common_context *ctx,
589			    struct r600_resource *buffer, uint64_t va)
590{
591	struct radeon_winsys_cs *cs = ctx->gfx.cs;
592
593	r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
594				 buffer, va, 1, 0);
595	r600_gfx_wait_fence(ctx, va, 0, 0xffffffff);
596
597	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
598	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
599	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
600	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_STOP) | EVENT_INDEX(0));
601	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
602			       S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
603			       S_036020_PERFMON_SAMPLE_ENABLE(1));
604}
605
606static void si_pc_emit_read(struct r600_common_context *ctx,
607			    struct r600_perfcounter_block *group,
608			    unsigned count, unsigned *selectors,
609			    struct r600_resource *buffer, uint64_t va)
610{
611	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
612	struct si_pc_block_base *regs = sigroup->b;
613	struct radeon_winsys_cs *cs = ctx->gfx.cs;
614	unsigned idx;
615	unsigned reg = regs->counter0_lo;
616	unsigned reg_delta = 8;
617
618	if (!(regs->layout & SI_PC_FAKE)) {
619		if (regs->layout & SI_PC_REG_REVERSE)
620			reg_delta = -reg_delta;
621
622		for (idx = 0; idx < count; ++idx) {
623			if (regs->counters)
624				reg = regs->counters[idx];
625
626			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
627			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
628					COPY_DATA_DST_SEL(COPY_DATA_MEM) |
629					COPY_DATA_COUNT_SEL); /* 64 bits */
630			radeon_emit(cs, reg >> 2);
631			radeon_emit(cs, 0); /* unused */
632			radeon_emit(cs, va);
633			radeon_emit(cs, va >> 32);
634			va += sizeof(uint64_t);
635			reg += reg_delta;
636		}
637	} else {
638		for (idx = 0; idx < count; ++idx) {
639			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
640			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
641					COPY_DATA_DST_SEL(COPY_DATA_MEM) |
642					COPY_DATA_COUNT_SEL);
643			radeon_emit(cs, 0); /* immediate */
644			radeon_emit(cs, 0);
645			radeon_emit(cs, va);
646			radeon_emit(cs, va >> 32);
647			va += sizeof(uint64_t);
648		}
649	}
650}
651
652static void si_pc_cleanup(struct r600_common_screen *rscreen)
653{
654	r600_perfcounters_do_destroy(rscreen->perfcounters);
655	rscreen->perfcounters = NULL;
656}
657
658void si_init_perfcounters(struct si_screen *screen)
659{
660	struct r600_perfcounters *pc;
661	struct si_pc_block *blocks;
662	unsigned num_blocks;
663	unsigned i;
664
665	switch (screen->b.chip_class) {
666	case CIK:
667		blocks = groups_CIK;
668		num_blocks = ARRAY_SIZE(groups_CIK);
669		break;
670	case VI:
671		blocks = groups_VI;
672		num_blocks = ARRAY_SIZE(groups_VI);
673		break;
674	case SI:
675	default:
676		return; /* not implemented */
677	}
678
679	if (screen->b.info.max_sh_per_se != 1) {
680		/* This should not happen on non-SI chips. */
681		fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
682			"supported (inaccurate performance counters)\n",
683			screen->b.info.max_sh_per_se);
684	}
685
686	pc = CALLOC_STRUCT(r600_perfcounters);
687	if (!pc)
688		return;
689
690	pc->num_start_cs_dwords = 14;
691	pc->num_stop_cs_dwords = 14 + r600_gfx_write_fence_dwords(&screen->b);
692	pc->num_instance_cs_dwords = 3;
693	pc->num_shaders_cs_dwords = 4;
694
695	pc->num_shader_types = ARRAY_SIZE(si_pc_shader_type_bits);
696	pc->shader_type_suffixes = si_pc_shader_type_suffixes;
697	pc->shader_type_bits = si_pc_shader_type_bits;
698
699	pc->get_size = si_pc_get_size;
700	pc->emit_instance = si_pc_emit_instance;
701	pc->emit_shaders = si_pc_emit_shaders;
702	pc->emit_select = si_pc_emit_select;
703	pc->emit_start = si_pc_emit_start;
704	pc->emit_stop = si_pc_emit_stop;
705	pc->emit_read = si_pc_emit_read;
706	pc->cleanup = si_pc_cleanup;
707
708	if (!r600_perfcounters_init(pc, num_blocks))
709		goto error;
710
711	for (i = 0; i < num_blocks; ++i) {
712		struct si_pc_block *block = &blocks[i];
713		unsigned instances = block->instances;
714
715		if (!strcmp(block->b->name, "IA")) {
716			if (screen->b.info.max_se > 2)
717				instances = 2;
718		}
719
720		r600_perfcounters_add_block(&screen->b, pc,
721					    block->b->name,
722					    block->b->flags,
723					    block->b->num_counters,
724					    block->selectors,
725					    instances,
726					    block);
727	}
728
729	screen->b.perfcounters = pc;
730	return;
731
732error:
733	r600_perfcounters_do_destroy(pc);
734}
735