evergreen_compute.c revision f986087d5ce7b0dee3287263acae856ea70e0777
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "radeon_llvm_util.h"
#endif

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings.  With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs.

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too because of linear indexing

*/
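/* Illustrative example (bindings assumed, not taken from a real run): with
 * one writable image and two read-only images bound, the writable image has
 * id 0 and maps to RAT1 plus its paired TEX2 slot, while the read-only
 * images occupy TEX3 and TEX4.  RAT0/VTX1 still carry the global buffer
 * pool, and CONST0/VTX0 the kernel parameters, as described above. */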

static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
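	/* A 1-byte stride makes consecutive bytes the fetch elements, so the
	 * shader can index the buffer linearly in bytes. */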
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	state->atom.dirty = true;
}

static void evergreen_cs_set_constant_buffer(
	struct r600_context * rctx,
	unsigned cb_index,
	unsigned offset,
	unsigned size,
	struct pipe_resource * buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->context.set_constant_buffer(&rctx->context, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};

void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;
	unsigned i;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	shader->ctx = (struct r600_context*)ctx;
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->num_kernels = radeon_llvm_get_num_kernels(code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);

	for (i = 0; i < shader->num_kernels; i++) {
		struct r600_kernel *kernel = &shader->kernels[i];
		kernel->llvm_module = radeon_llvm_get_kernel_module(i, code,
							header->num_bytes);
	}
#endif
	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
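 *
 * Illustrative example (values assumed): a 4x2x1 grid of 64x1x1 work groups
 * with a single dword kernel argument would be laid out as
 *   DWORDS 0-2: 4, 2, 1     (work groups per dimension)
 *   DWORDS 3-5: 256, 2, 1   (global work items = grid * block)
 *   DWORDS 6-8: 64, 1, 1    (work items per group)
 *   DWORD  9  : the kernel argument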
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	int i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = r600_compute_buffer_alloc_vram(
						ctx->screen, input_size);
	}

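	/* Map the buffer and carve it into the three implicit 3-dword blocks
	 * described above, followed by the explicit kernel parameters. */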
	num_work_groups_start = r600_buffer_mmap_sync_with_rings(ctx, shader->kernel_param, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * sizeof(uint) / 4);
	local_size_start = global_size_start + (3 * sizeof(uint) / 4);
	kernel_parameters_start = local_size_start + (3 * sizeof(uint) / 4);

	/* Copy the grid layout (number of work groups) */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* XXX: Enable lds and get size from cs_shader_state */
	unsigned lds_size = 0;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
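	/* For example: a 16x16x1 block on a part with 8 pipes gives
	 * 256 / (16 * 8) = 2 wavefronts. */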

	COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
	 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
	 * We may need to allocate the entire LDS space for Compute Shaders.
	 *
	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
	 * CM: CM_R_0286FC_SPI_LDS_MGMT :=  S_0286FC_NUM_LS_LDS(lds_dwords)
	 */

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	r600_write_value(cs, grid_layout[0]);
	r600_write_value(cs, grid_layout[1]);
	r600_write_value(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	r600_write_value(cs, 1);
}

static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
	unsigned flush_flags = 0;
	int i;
	struct evergreen_compute_resource *resources =
					ctx->cs_shader_state.shader->resources;

	/* Make sure the gfx ring is the only one active. */
	if (ctx->rings.dma.cs) {
		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		r600_write_value(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		r600_write_value(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		r600_write_value(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		r600_write_value(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		r600_write_value(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			r600_write_value(cs, reloc);
		}

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, reloc);
	}

	/* Set CB_TARGET_MASK. XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (resources[i].enabled) {
			int j;
			COMPUTE_DBG(ctx->screen, "resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < resources[i].cs_end; j++) {
				if (resources[i].do_reloc[j]) {
					assert(resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}

				cs->buf[cs->cdw++] = resources[i].cs[j];
			}

			if (resources[i].bo) {
				evergreen_emit_ctx_reloc(ctx,
					resources[i].bo,
					resources[i].usage);

				/* Special case for textures */
				if (resources[i].do_reloc[resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}
			}
		}
	}

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX: evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

	flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
	if (ctx->keep_tiling_flags) {
		flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
	}

	ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags);

	ctx->flags = 0;

	COMPUTE_DBG(ctx->screen, "shader started\n");
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
	uint64_t va;

	va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b);

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	r600_write_value(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(kernel->bc.ngpr)
			| S_0288D4_STACK_SIZE(kernel->bc.nstack));
	r600_write_value(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
	r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx,
							kernel->code_bo, RADEON_USAGE_READ));

	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
}

static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

#ifdef HAVE_OPENCL
	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	if (!shader->kernels[pc].code_bo) {
		void *p;
		struct r600_kernel *kernel = &shader->kernels[pc];
		r600_compute_shader_create(ctx_, kernel->llvm_module, &kernel->bc);
		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							kernel->bc.ndw * 4);
		p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
	}
#endif

	ctx->cs_shader_state.kernel_index = pc;
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (int i = 0; i < count; i++) {
		if (resource[i]) {
			assert(i+1 < 12);
			/* FETCH0 = VTX0 (param buffer),
			 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
			evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i+2);
		}
	}
}

static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state ** samplers =
		(struct compute_sampler_state **)samplers_;

	for (int i = 0; i < num_samplers; i++) {
		if (samplers[i]) {
			evergreen_set_sampler_resource(
				ctx->cs_shader_state.shader, samplers[i], i);
		}
	}
}

static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

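	/* The handle reported back for each buffer is its byte offset within
	 * the global memory pool (chunk start in dwords times 4). */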
	for (int i = 0; i < n; i++) {
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

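	/* Bind the whole pool: RAT0 for writes and VTX1 for reads, matching
	 * the binding layout described at the top of this file. */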
	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* Since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	/* All these families use 128 compute threads; only the number of
	 * control flow stack entries differs. */
	switch (ctx->family) {
	case CHIP_JUNIPER:
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
	case CHIP_SUMO2:
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CEDAR:
	case CHIP_REDWOOD:
	case CHIP_PALM:
	case CHIP_SUMO:
	case CHIP_TURKS:
	case CHIP_CAICOS:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->chip_class < CAYMAN)
		evergreen_init_common_regs(cb, ctx->chip_class, ctx->family,
					   ctx->screen->info.drm_minor);
	else
		cayman_init_common_regs(cb, ctx->chip_class, ctx->family,
					ctx->screen->info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA |
						S_0286E8_TGID_ENA |
						S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
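	/* 0x1000FFF packs those values: max count 0xfff in the low 12 bits
	 * and an increment of 1 in the top byte (bit positions assumed from
	 * the description above). */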
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;

	/* We always use at least one vertex buffer for parameters (id = 1) */
	ctx->cs_vertex_buffer_state.enabled_mask =
	ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
}

struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
	CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

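	/* Round the requested byte size up to whole dwords; the pool
	 * allocates in dword units. */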
	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;
	uint32_t* map;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	/* TODO: do it better; mapping is not possible if the pool is too big */

	if (!(map = r600_buffer_mmap_sync_with_rings(rctx, buffer->chunk->pool->bo, transfer->usage))) {
		util_slab_free(&rctx->pool_transfers, transfer);
		return NULL;
	}

	*ptransfer = transfer;

	COMPUTE_DBG(rctx->screen, "Buffer: %p + %u (buffer offset in global memory) "
		"+ %u (box.x)\n", map, buffer->chunk->start_in_dw, transfer->box.x);
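	/* map is a dword pointer, so adding start_in_dw advances in dwords;
	 * box.x is then applied in bytes after the cast to char*. */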
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	struct r600_context *ctx = NULL;
	struct r600_resource_global* buffer = NULL;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	ctx = (struct r600_context *)ctx_;
	buffer = (struct r600_resource_global*)transfer->resource;

	COMPUTE_DBG(ctx->screen, "* r600_compute_global_transfer_unmap()\n");

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
	util_slab_free(&ctx->pool_transfers, transfer);
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}
959