evergreen_compute.c revision b9e41b587fb15458c9b0c21b10d421d882083e27
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#ifdef HAVE_OPENCL
#include "radeon/radeon_llvm_util.h"
#endif
#include "radeon/radeon_elf_util.h"
#include <inttypes.h>

/**
RAT0 is for global binding writes
VTX1 is for global binding reads

for writing images: RAT1...
for reading images: TEX2...
  TEX2 is paired with RAT1

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or when the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => at most 10 images can be bound for writing.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images also consume TEX slots, and VTX slots too, because of linear indexing

*/
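
/* Illustrative sketch only, derived from the notes above rather than from
 * additional hardware documentation: a kernel that uses the global buffer
 * pool plus one writable image would end up with
 *
 *   RAT0 / VTX1   - global buffer pool (write / read)
 *   RAT1 / TEX2   - writable image 0 (write / read, paired)
 *   CONST0 / VTX0 - kernel parameters
 *
 * and any read-only images would occupy the following TEX slots.
 */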

struct r600_resource* r600_compute_buffer_alloc_vram(
       struct r600_screen *screen,
       unsigned size)
{
	struct pipe_resource * buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create(
		(struct pipe_screen*) screen,
		PIPE_BIND_CUSTOM,
		PIPE_USAGE_IMMUTABLE,
		size);

	return (struct r600_resource *)buffer;
}


static void evergreen_set_rat(
	struct r600_pipe_compute *pipe,
	unsigned id,
	struct r600_resource* bo,
	int start,
	int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	state->atom.dirty = true;
}

static void evergreen_cs_set_constant_buffer(
	struct r600_context * rctx,
	unsigned cb_index,
	unsigned offset,
	unsigned size,
	struct pipe_resource * buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};


void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#if HAVE_LLVM < 0x0306
	(void)use_kill;
	(void)p;
	shader->llvm_ctx = LLVMContextCreate();
	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
				code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel),
				shader->num_kernels);
	{
		unsigned i;
		for (i = 0; i < shader->num_kernels; i++) {
			struct r600_kernel *kernel = &shader->kernels[i];
			kernel->llvm_module = radeon_llvm_get_kernel_module(
				shader->llvm_ctx, i, code, header->num_bytes);
		}
	}
#else
	memset(&shader->binary, 0, sizeof(shader->binary));
	radeon_elf_read(code, header->num_bytes, &shader->binary, true);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
#endif
#endif

	shader->ctx = (struct r600_context*)ctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	if (!shader)
		return;

	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
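/* Purely illustrative example of the layout described above (the sizes are
 * hypothetical, not taken from any particular kernel): with
 * block_layout = {64, 1, 1} and grid_layout = {2, 2, 1} the buffer begins
 *
 *   DWORDS 0-2: 2, 2, 1     (work groups per dimension)
 *   DWORDS 3-5: 128, 2, 1   (global work items = grid * block per dimension)
 *   DWORDS 6-8: 64, 1, 1    (work items per work group)
 *   DWORDS 9+ : the kernel arguments, copied verbatim from 'input'
 */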
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx_->transfer_map(ctx_,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
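	/* The pointer arithmetic below is in uint32_t units; assuming the usual
	 * 4-byte uint, sizeof(uint) / 4 is 1, so each section advances by
	 * exactly 3 dwords. */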
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the grid layout (number of work groups in each dimension) */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx_->transfer_unmap(ctx_, transfer);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
#if HAVE_LLVM < 0x0306
		shader->active_kernel->bc.nlds_dw;
#else
		shader->bc.nlds_dw;
#endif


	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
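	/* Worked example (numbers are illustrative only): a 16x16x1 thread
	 * block on a part with 2 pipes gives ceil(256 / 32) = 8 wavefronts. */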

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}

static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
	unsigned i;

	/* Make sure that the gfx ring is the only one active. */
	if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
		ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RESOURCE_RW);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			radeon_emit(cs, reloc);
		}

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	if (ctx->keep_tiling_flags) {
		for (; i < 8 ; i++) {
			r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
		for (; i < 12; i++) {
			r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);


	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			R600_CONTEXT_INV_VERTEX_CACHE |
			R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(ctx);
	ctx->b.flags = 0;

	if (ctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

}


/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	code_bo = kernel->code_bo;
	va = kernel->code_bo->gpu_address;
	ngpr = kernel->bc.ngpr;
	nstack = kernel->bc.nstack;
#else
	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;
#endif

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_SHADER_DATA));
}

static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	boolean use_kill;

#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[pc];
	(void)use_kill;
	if (!kernel->code_bo) {
		void *p;
		struct r600_bytecode *bc = &kernel->bc;
		LLVMModuleRef mod = kernel->llvm_module;
		boolean use_kill = false;
		bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
		unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
		unsigned sb_disasm = use_sb ||
			(ctx->screen->b.debug_flags & DBG_SB_DISASM);

		r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
				   ctx->screen->has_compressed_msaa_texturing);
		bc->type = TGSI_PROCESSOR_COMPUTE;
		bc->isa = ctx->isa;
		r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);

		if (dump && !sb_disasm) {
			r600_bytecode_disasm(bc);
		} else if ((dump && sb_disasm) || use_sb) {
			if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
				R600_ERR("r600_sb_bytecode_process failed!\n");
		}

		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
								 kernel->bc.ndw * 4);
		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
	}
	shader->active_kernel = kernel;
	ctx->cs_shader_state.kernel_index = pc;
#else
	ctx->cs_shader_state.pc = pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
#endif
#endif

	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);


	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (unsigned i = 0; i < count; i++) {
		if (resource[i]) {
			assert(i+1 < 12);
			/* XXX: Implement */
			assert(!"Compute samplers not implemented.");
			/* FETCH0 = VTX0 (param buffer),
			 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
		}
	}
}


static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there. */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialised in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (ctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
					   ctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
					ctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (ctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (ctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
				S_0286E8_TID_IN_GROUP_ENA |
				S_0286E8_TGID_ENA |
				S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that the hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
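	/* 0x1000FFF appears to pack exactly those three values: the increment
	 * of 1 in the high bits, the initial value of 0 in the middle, and the
	 * 0xFFF maximum count in the low 12 bits. */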
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->b.b.create_compute_state = evergreen_create_compute_state;
	ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	ctx->b.b.set_global_binding = evergreen_set_global_binding;
	ctx->b.b.launch_grid = evergreen_launch_grid;

}

struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b.screen = screen;
	result->base.b.b = *templ;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx_);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer = (struct r600_resource*)
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);


	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do it better; mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx_, dst,
			offset, box->width, usage, ptransfer);
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert(!"This function should not be called");
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}
1080