evergreen_compute.c revision e1dcd333e4e0757f3fd2b010bc14b36340b70c39
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#ifdef HAVE_OPENCL
#include "radeon/radeon_llvm_util.h"
#endif
#include "radeon/radeon_elf_util.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too because of linear indexing

*/

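/* Illustrative sketch of the mapping described above (not part of the driver
 * logic; "image_bo" and "image_size" are placeholder names): binding the
 * global memory pool plus one writable image ends up roughly as
 *
 *   evergreen_set_rat(shader, 0, pool->bo, 0, pool->size_in_dw * 4);  RAT0
 *   evergreen_cs_set_vertex_buffer(ctx, 1, 0, pool->bo);              VTX1
 *   evergreen_set_rat(shader, 1, image_bo, 0, image_size);            RAT1
 *   evergreen_cs_set_vertex_buffer(ctx, 2, 0, image_bo);              VTX2
 *
 * which matches what evergreen_set_global_binding() and
 * evergreen_set_compute_resources() below actually do.
 */
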
struct r600_resource* r600_compute_buffer_alloc_vram(
       struct r600_screen *screen,
       unsigned size)
{
	struct pipe_resource * buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create(
		(struct pipe_screen*) screen,
		PIPE_BIND_CUSTOM,
		PIPE_USAGE_IMMUTABLE,
		size);

	return (struct r600_resource *)buffer;
}


static void evergreen_set_rat(
	struct r600_pipe_compute *pipe,
	unsigned id,
	struct r600_resource* bo,
	int start,
	int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i\n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}
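
/* Note on the asserts in evergreen_set_rat() above (worked example): the RAT
 * window must start on a 256-byte boundary and its size must be a multiple of
 * 4 bytes, so e.g. start = 0x100 with size = 64 passes, while start = 0x80
 * would trip the (start & 0xFF) == 0 assertion. */
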
147
148static void evergreen_cs_set_vertex_buffer(
149	struct r600_context * rctx,
150	unsigned vb_index,
151	unsigned offset,
152	struct pipe_resource * buffer)
153{
154	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156	vb->stride = 1;
157	vb->buffer_offset = offset;
158	vb->buffer = buffer;
159	vb->user_buffer = NULL;
160
161	/* The vertex instructions in the compute shaders use the texture cache,
162	 * so we need to invalidate it. */
163	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164	state->enabled_mask |= 1 << vb_index;
165	state->dirty_mask |= 1 << vb_index;
166	r600_mark_atom_dirty(rctx, &state->atom);
167}
168
169static void evergreen_cs_set_constant_buffer(
170	struct r600_context * rctx,
171	unsigned cb_index,
172	unsigned offset,
173	unsigned size,
174	struct pipe_resource * buffer)
175{
176	struct pipe_constant_buffer cb;
177	cb.buffer_size = size;
178	cb.buffer_offset = offset;
179	cb.buffer = buffer;
180	cb.user_buffer = NULL;
181
182	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183}
184
185static const struct u_resource_vtbl r600_global_buffer_vtbl =
186{
187	u_default_resource_get_handle, /* get_handle */
188	r600_compute_global_buffer_destroy, /* resource_destroy */
189	r600_compute_global_transfer_map, /* transfer_map */
190	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191	r600_compute_global_transfer_unmap, /* transfer_unmap */
192	r600_compute_global_transfer_inline_write /* transfer_inline_write */
193};


void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#if HAVE_LLVM < 0x0306
	(void)use_kill;
	(void)p;
	shader->llvm_ctx = LLVMContextCreate();
	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
				code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel),
				shader->num_kernels);
	{
		unsigned i;
		for (i = 0; i < shader->num_kernels; i++) {
			struct r600_kernel *kernel = &shader->kernels[i];
			kernel->llvm_module = radeon_llvm_get_kernel_module(
				shader->llvm_ctx, i, code, header->num_bytes);
		}
	}
#else
	memset(&shader->binary, 0, sizeof(shader->binary));
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif
#endif

	shader->ctx = ctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	if (!shader)
		return;

	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
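 *
 * For example (illustrative numbers only): block_layout = (64,1,1) and
 * grid_layout = (4,2,1) would produce
 *   DWORDS 0-2: 4, 2, 1     (number of work groups)
 *   DWORDS 3-5: 256, 2, 1   (global work items = grid * block)
 *   DWORDS 6-8: 64, 1, 1    (work items per work group)
 *   DWORDS 9+ : the caller's kernel arguments, copied verbatim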
 */
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx_->transfer_map(ctx_,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx_->transfer_unmap(ctx_, transfer);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
#if HAVE_LLVM < 0x0306
		shader->active_kernel->bc.nlds_dw;
#else
		shader->bc.nlds_dw;
#endif

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
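
	/* Worked example (illustrative numbers): a 16x16x1 thread block on a
	 * part with r600_max_pipes = 8 gives wave_divisor = 128, so
	 * num_waves = ceil(256 / 128) = 2 wavefronts per thread block. */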

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}

static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
	unsigned i;

	/* Make sure the gfx ring is the only ring active by flushing any
	 * pending DMA work first. */
	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	/* Emit the config state */
	if (ctx->b.chip_class == EVERGREEN)
		r600_emit_atom(ctx, &ctx->config_state.atom);

	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			radeon_emit(cs, reloc);
		}

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	if (ctx->keep_tiling_flags) {
		for (; i < 8 ; i++) {
			radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
		for (; i < 12; i++) {
			radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			R600_CONTEXT_INV_VERTEX_CACHE |
			R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(ctx);
	ctx->b.flags = 0;

	if (ctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}


/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	code_bo = kernel->code_bo;
	va = kernel->code_bo->gpu_address;
	ngpr = kernel->bc.ngpr;
	nstack = kernel->bc.nstack;
#else
	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;
#endif

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_USER_SHADER));
}

static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	boolean use_kill;

#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[pc];
	(void)use_kill;
	if (!kernel->code_bo) {
		void *p;
		struct r600_bytecode *bc = &kernel->bc;
		LLVMModuleRef mod = kernel->llvm_module;
		boolean use_kill = false;
		bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
		unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
		unsigned sb_disasm = use_sb ||
			(ctx->screen->b.debug_flags & DBG_SB_DISASM);

		r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
				   ctx->screen->has_compressed_msaa_texturing);
		bc->type = TGSI_PROCESSOR_COMPUTE;
		bc->isa = ctx->isa;
		r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);

		if (dump && !sb_disasm) {
			r600_bytecode_disasm(bc);
		} else if ((dump && sb_disasm) || use_sb) {
			if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
				R600_ERR("r600_sb_bytecode_process failed!\n");
		}

		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							kernel->bc.ndw * 4);
		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
	}
	shader->active_kernel = kernel;
	ctx->cs_shader_state.kernel_index = pc;
#else
	ctx->cs_shader_state.pc = pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
#endif
#endif

	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}
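
#if 0
/* Illustrative only, kept out of the build: how a state tracker might invoke
 * the hook installed below as ctx->b.b.launch_grid.  "pipe" and "args" are
 * placeholder names for a struct pipe_context * and the packed kernel
 * arguments; pc = 0 assumes the first kernel of the compiled binary. */
static void example_launch(struct pipe_context *pipe, const void *args)
{
	const uint block[3] = {64, 1, 1}; /* work items per group */
	const uint grid[3]  = {16, 1, 1}; /* work groups per dimension */

	pipe->launch_grid(pipe, block, grid, 0 /* pc */, args);
}
#endif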

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw*4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}
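
#if 0
/* Illustrative only, kept out of the build: binding two read/write surfaces
 * through the hook installed below as ctx->b.b.set_compute_resources.
 * "pipe", "surf0" and "surf1" are placeholder names; the surfaces are assumed
 * to have been created with pipe->create_surface() on PIPE_BIND_GLOBAL
 * buffers.  Surface i lands in vertex buffer 2 + i, and in RAT(i + 1) when
 * writable. */
static void example_bind_resources(struct pipe_context *pipe,
				   struct pipe_surface *surf0,
				   struct pipe_surface *surf1)
{
	struct pipe_surface *surfs[2] = { surf0, surf1 };

	pipe->set_compute_resources(pipe, 0, 2, surfs);
}
#endif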

static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}
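
	/* For example (illustrative numbers): if a buffer's chunk starts at
	 * dword 256 of the pool and the state tracker stored a byte offset of
	 * 16 in the handle, the handle is rewritten to 256 * 4 + 16 = 1040,
	 * i.e. a byte address relative to the start of the pool (RAT0/VTX1). */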

	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (ctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
					   ctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
					ctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
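	/* For reference: 8192 dwords is 32 KiB of LDS; Cayman exposes slightly
	 * less (255 * 32 = 8160 dwords), matching the limits asserted in
	 * evergreen_emit_direct_dispatch() above. */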
	if (ctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (ctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA
						| S_0286E8_TGID_ENA
						| S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
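	/* 0x1000FFF presumably packs the three fields described above: a max
	 * count of 0xFFF (4095) in the low 12 bits, an initial value of 0, and
	 * an increment of 1 in the upper bits (assumed field layout). */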
}

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->b.b.create_compute_state = evergreen_create_compute_state;
	ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	ctx->b.b.set_global_binding = evergreen_set_global_binding;
	ctx->b.b.launch_grid = evergreen_launch_grid;
}

struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b.screen = screen;
	result->base.b.b = *templ;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;
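	/* Rounded up to whole dwords; e.g. a 10-byte buffer takes 3 dwords
	 * (12 bytes) of pool space. */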

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx_);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do it better; mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx_, dst,
			offset, box->width, usage, ptransfer);
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}