r600_hw_context.c revision 696b6cf46609281711add5331b9c3e1d0240ecbc
/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0

/* Get the mask of enabled render backends (DBs); it tells us which ZPASS
 * counter slots in occlusion query buffers will actually be written. */
void r600_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width, item_mask;

		if (ctx->chip_class >= CAYMAN) {
			item_width = 4; /* 4 bits per tile pipe */
			item_mask = 0x7;
		} else {
			/* defensive default (2 bits per pipe on older chips);
			 * radeonsi is always >= CAYMAN, but this avoids using
			 * item_width/item_mask uninitialized */
			item_width = 2;
			item_mask = 0x3;
		}

		while (num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1<<i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise backup path for older kernels */
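	/*
	 * Fallback: write a ZPASS_DONE event into a scratch buffer and see
	 * which per-DB slots get updated.  Each DB owns a 16-byte slot and
	 * an active backend should set the top bit of its 64-bit counter,
	 * so a non-zero high dword in slot i means backend i exists.
	 */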

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db*16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		uint64_t va = 0;

		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for (i = 0; i < ctx->max_db; i++) {
				/* at least highest bit will be set if backend is used */
				if (results[i*4 + 1])
					mask |= (1<<i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fallback to old method - set num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
	return;
}

/* Make sure there is room for at least num_dw more dwords in the CS,
 * flushing the current command stream if there is not. */
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in)
{
	struct r600_atom *state;

	/* The number of dwords we already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
			num_dw += state->num_dw;
		}

		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper-bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}

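/* Flush the CB/DB destination caches for the bound framebuffer: either emit
 * the surface-sync atom immediately (flush_now) or mark it dirty so it goes
 * out with the next draw. */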
static void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
{
	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	ctx->atom_surface_sync.flush_flags |=
		r600_get_cb_flush_flags(ctx) |
		(ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);

	if (flush_now) {
		r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
	} else {
		r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
	}

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

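/* Submit the current command stream: suspend active queries and streamout,
 * flush framebuffer caches, emit a partial flush, hand the CS to the winsys,
 * then re-arm queries/streamout and mark all state for re-emission. */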
void r600_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_block *enable_block = NULL;
	bool queries_suspended = false;
	bool streamout_suspended = false;

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}

	r600_flush_framebuffer(ctx, true);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force to keep tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	/* Mark all valid state groups as dirty so they get re-emitted on the
	 * next draw command. */
	si_pm4_reset_emitted(ctx);
}

void r600_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	r600_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

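	/* PS_PARTIAL_FLUSH first (same user-fence lockup workaround as in
	 * r600_context_flush), then EVENT_WRITE_EOP: once all preceding work
	 * has drained, the CP writes the 32-bit 'value' to 'va'
	 * (DATA_SEL=1, no interrupt). */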
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value;                   /* DATA_LO */
	cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

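/* Read one {start, end} counter pair from a mapped query buffer.  Indices
 * are in dwords.  Bit 63 of each 64-bit value is a "result written" flag set
 * by the hardware; when test_status_bit is true, the pair only counts once
 * both halves have been written. */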
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}

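/* Accumulate all completed result blocks of a query into query->result.
 * The query buffer is used as a ring: blocks from results_start up to
 * results_end are summed and then consumed. */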
static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}

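/* Start a query: reclaim old results if the ring is full, zero the next
 * result slot, emit the "begin" event that snapshots the counters, and
 * account for the CS dwords the matching end will need so the query can be
 * suspended around a flush. */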
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

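/* Stop a query: emit the "end" event that writes the closing counter values
 * for the current result block, advance the ring pointer, and release the CS
 * dwords that were reserved for suspend/resume. */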
void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		r600_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		r600_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
				(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

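		/* Each block gets its own SET_PREDICATION packet; with the
		 * CONTINUE bit set on all but the first, the CP accumulates
		 * the visibility results, so the draw proceeds if any block
		 * reports passing samples. */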
		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}

struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the gpu, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}

void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	FREE(query); /* pairs with CALLOC_STRUCT */
}

boolean r600_context_query_result(struct r600_context *ctx,
				struct r600_query *query,
				boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}

void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}

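/* Begin streamout: build the mask of enabled buffers, reserve CS space
 * (including what the matching streamout_end will need), then flush and
 * enable VGT streamout.  The per-buffer register/packet emission below is
 * compiled out (#if 0) in this revision. */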
void r600_context_streamout_begin(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned *strides = ctx->vs_shader_so_strides;
	unsigned buffer_en, i;

	buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
		    (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
		    (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
		    (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);

	ctx->num_cs_dw_streamout_end =
		12 + /* flush_vgt_streamout */
		util_bitcount(buffer_en) * 8 +
		3;

	r600_need_cs_space(ctx,
			   12 + /* flush_vgt_streamout */
			   6 + /* enables */
			   util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
			   util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
			   ctx->num_cs_dw_streamout_end, TRUE);

	if (ctx->chip_class >= CAYMAN) {
		evergreen_flush_vgt_streamout(ctx);
		evergreen_set_streamout_enable(ctx, buffer_en);
	}

	for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
		if (t[i]) {
			t[i]->stride = strides[i];
			t[i]->so_index = i;

			cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
			cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
							16*i - SI_CONTEXT_REG_OFFSET) >> 2;
			cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
							t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
			cs->buf[cs->cdw++] = strides[i] >> 2;		   /* VTX_STRIDE (in DW) */
			cs->buf[cs->cdw++] = 0;			   /* BUFFER_BASE */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx, si_resource(t[i]->b.buffer),
						      RADEON_USAGE_WRITE);

			if (ctx->streamout_append_bitmask & (1 << i)) {
				/* Append. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
							       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* src address lo */
				cs->buf[cs->cdw++] = 0; /* src address hi */

				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
				cs->buf[cs->cdw++] =
					r600_context_bo_reloc(ctx,  t[i]->filled_size,
							      RADEON_USAGE_READ);
			} else {
				/* Start from the beginning. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
							       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
				cs->buf[cs->cdw++] = 0; /* unused */
			}
		}
#endif
	}
}

void r600_context_streamout_end(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned i, flush_flags = 0;

	evergreen_flush_vgt_streamout(ctx);

	for (i = 0; i < ctx->num_so_targets; i++) {
#if 0
		if (t[i]) {
			cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
			cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
						       STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
						       STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
			cs->buf[cs->cdw++] = 0; /* dst address lo */
			cs->buf[cs->cdw++] = 0; /* dst address hi */
			cs->buf[cs->cdw++] = 0; /* unused */
			cs->buf[cs->cdw++] = 0; /* unused */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx,  t[i]->filled_size,
						      RADEON_USAGE_WRITE);

			flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
		}
#endif
	}

	evergreen_set_streamout_enable(ctx, 0);

	ctx->atom_surface_sync.flush_flags |= flush_flags;
	r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);

	ctx->num_cs_dw_streamout_end = 0;

	/* XXX print some debug info */
	for (i = 0; i < ctx->num_so_targets; i++) {
		if (!t[i])
			continue;

		/* buffer_map takes PIPE_TRANSFER_* flags, not RADEON_USAGE_*. */
		uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (ptr) {
			printf("FILLED_SIZE%i: %u\n", i, *ptr);
			ctx->ws->buffer_unmap(t[i]->filled_size->cs_buf);
		}
	}
}

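/* Set up a DRAW_OPAQUE draw: program the opaque offset and vertex stride so
 * the VGT can derive the vertex count from the streamout buffer's filled
 * size.  The COPY_DW/CP_COHER synchronization sequences are compiled out
 * (#if 0) in this revision. */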
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	r600_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */
#endif

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
	cs->buf[cs->cdw++] = 0; /* unused */
	cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
						   RADEON_USAGE_WRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2;  /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = 0; /* reference value */
	cs->buf[cs->cdw++] = 0xffffffff; /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}
