evergreen_compute.c revision 6cc8f6c6a72b1aab7bb506deb220e04ae50d8c2b
1/*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 *      Adam Rak <adam.rak@streamnovation.com>
25 */
26
27#include <stdio.h>
28#include <errno.h>
29#include "pipe/p_defines.h"
30#include "pipe/p_state.h"
31#include "pipe/p_context.h"
32#include "util/u_blitter.h"
33#include "util/list.h"
34#include "util/u_transfer.h"
35#include "util/u_surface.h"
36#include "util/u_pack_color.h"
37#include "util/u_memory.h"
38#include "util/u_inlines.h"
39#include "util/u_framebuffer.h"
40#include "pipebuffer/pb_buffer.h"
41#include "evergreend.h"
42#include "r600_shader.h"
43#include "r600_pipe.h"
44#include "r600_formats.h"
45#include "evergreen_compute.h"
46#include "evergreen_compute_internal.h"
47#include "compute_memory_pool.h"
48#include "sb/sb_public.h"
49#ifdef HAVE_OPENCL
50#include "radeon/radeon_llvm_util.h"
51#endif
52#include "radeon/radeon_elf_util.h"
53#include <inttypes.h>
54
55/**
56RAT0 is for global binding writes.
57VTX1 is for global binding reads.
58
59For writing images: RAT1...
60For reading images: TEX2...
61  TEX2 and RAT1 are paired, and so on.
62
63TEX2... consumes the same fetch resources that VTX2... would consume.
64
65CONST0 and VTX0 are for kernel parameters:
66  CONST0 binds the smaller input parameter buffers and is used for constant
67  indexing; it is also cached in the constant cache.
68  VTX0 is used for indirect/non-constant indexing, or when the input is bigger
69  than the constant cache can handle.
70
71RATs are limited to 12, so we can bind at most 11 images for writing,
72because we reserve RAT0 for global bindings. With byte addressing enabled,
73we should reserve another one too => at most 10 image bindings for writing.
74
75From Nvidia OpenCL:
76  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
77  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
78
79So 10 for writing is enough. 176 is the maximum for reading according to the docs.
80
81Writable images should be listed first (< 10), so their id corresponds to RAT(id+1).
82Writable images also consume TEX and VTX slots because of linear indexing.
83
84*/
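/*
 * Illustrative sketch only (not used by the driver): for a hypothetical kernel
 * with one writable image and one read-only image, the scheme above would give
 * roughly this binding layout:
 *
 *   RAT0          - global buffer writes
 *   VTX1          - global buffer reads
 *   CONST0 / VTX0 - kernel parameters
 *   RAT1 + TEX2   - writable image with id 0 (RAT(id+1) paired with TEX(id+2))
 *   TEX3          - read-only image with id 1
 */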
85
86struct r600_resource* r600_compute_buffer_alloc_vram(
87       struct r600_screen *screen,
88       unsigned size)
89{
90	struct pipe_resource * buffer = NULL;
91	assert(size);
92
93	buffer = pipe_buffer_create(
94		(struct pipe_screen*) screen,
95		PIPE_BIND_CUSTOM,
96		PIPE_USAGE_IMMUTABLE,
97		size);
98
99	return (struct r600_resource *)buffer;
100}
101
102
103static void evergreen_set_rat(
104	struct r600_pipe_compute *pipe,
105	unsigned id,
106	struct r600_resource* bo,
107	int start,
108	int size)
109{
110	struct pipe_surface rat_templ;
111	struct r600_surface *surf = NULL;
112	struct r600_context *rctx = NULL;
113
114	assert(id < 12);
115	assert((size & 3) == 0);
116	assert((start & 0xFF) == 0);
117
118	rctx = pipe->ctx;
119
120	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
121
122	/* Create the RAT surface */
123	memset(&rat_templ, 0, sizeof(rat_templ));
124	rat_templ.format = PIPE_FORMAT_R32_UINT;
125	rat_templ.u.tex.level = 0;
126	rat_templ.u.tex.first_layer = 0;
127	rat_templ.u.tex.last_layer = 0;
128
129	/* Add the RAT to the list of color buffers */
130	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131		(struct pipe_context *)pipe->ctx,
132		(struct pipe_resource *)bo, &rat_templ);
133
134	/* Update the number of color buffers */
135	pipe->ctx->framebuffer.state.nr_cbufs =
136		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138	/* Update the cb_target_mask
139	 * XXX: I think this is a potential spot for bugs once we start doing
140	 * GL interop.  cb_target_mask may be modified in the 3D sections
141	 * of this driver. */
142	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
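	/* For example, id = 1 enables the four CB1 component write bits: 0xf << 4 == 0xf0. */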
143
144	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145	evergreen_init_color_surface_rat(rctx, surf);
146}
147
148static void evergreen_cs_set_vertex_buffer(
149	struct r600_context * rctx,
150	unsigned vb_index,
151	unsigned offset,
152	struct pipe_resource * buffer)
153{
154	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156	vb->stride = 1;
157	vb->buffer_offset = offset;
158	vb->buffer = buffer;
159	vb->user_buffer = NULL;
160
161	/* The vertex instructions in the compute shaders use the texture cache,
162	 * so we need to invalidate it. */
163	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164	state->enabled_mask |= 1 << vb_index;
165	state->dirty_mask |= 1 << vb_index;
166	r600_mark_atom_dirty(rctx, &state->atom);
167}
168
169static void evergreen_cs_set_constant_buffer(
170	struct r600_context * rctx,
171	unsigned cb_index,
172	unsigned offset,
173	unsigned size,
174	struct pipe_resource * buffer)
175{
176	struct pipe_constant_buffer cb;
177	cb.buffer_size = size;
178	cb.buffer_offset = offset;
179	cb.buffer = buffer;
180	cb.user_buffer = NULL;
181
182	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183}
184
185static const struct u_resource_vtbl r600_global_buffer_vtbl =
186{
187	u_default_resource_get_handle, /* get_handle */
188	r600_compute_global_buffer_destroy, /* resource_destroy */
189	r600_compute_global_transfer_map, /* transfer_map */
190	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191	r600_compute_global_transfer_unmap, /* transfer_unmap */
192	r600_compute_global_transfer_inline_write /* transfer_inline_write */
193};
194
195
196void *evergreen_create_compute_state(
197	struct pipe_context *ctx_,
198	const struct pipe_compute_state *cso)
199{
200	struct r600_context *ctx = (struct r600_context *)ctx_;
201	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202#ifdef HAVE_OPENCL
203	const struct pipe_llvm_program_header * header;
204	const char *code;
205	void *p;
206	boolean use_kill;
207
208	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209	header = cso->prog;
210	code = cso->prog + sizeof(struct pipe_llvm_program_header);
211#if HAVE_LLVM < 0x0306
212        (void)use_kill;
213	(void)p;
214	shader->llvm_ctx = LLVMContextCreate();
215	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
216				code, header->num_bytes);
217	shader->kernels = CALLOC(sizeof(struct r600_kernel),
218				shader->num_kernels);
219	{
220		unsigned i;
221		for (i = 0; i < shader->num_kernels; i++) {
222			struct r600_kernel *kernel = &shader->kernels[i];
223			kernel->llvm_module = radeon_llvm_get_kernel_module(
224				shader->llvm_ctx, i, code, header->num_bytes);
225		}
226	}
227#else
228	memset(&shader->binary, 0, sizeof(shader->binary));
229	radeon_elf_read(code, header->num_bytes, &shader->binary);
230	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
231
232	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
233							shader->bc.ndw * 4);
234	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
235	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
236	ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
237#endif
238#endif
239
240	shader->ctx = (struct r600_context*)ctx;
241	shader->local_size = cso->req_local_mem;
242	shader->private_size = cso->req_private_mem;
243	shader->input_size = cso->req_input_mem;
244
245	return shader;
246}
247
248void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
249{
250	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
251
252	if (!shader)
253		return;
254
255	FREE(shader);
256}
257
258static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
259{
260	struct r600_context *ctx = (struct r600_context *)ctx_;
261
262	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
263
264	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
265}
266
267/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
268 * explicit kernel parameters, there are implicit parameters that need to be
269 * stored in the vertex buffer as well.  Here is how these parameters are
270 * organized in the buffer:
271 *
272 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
273 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
274 * DWORDS 6-8: Number of work items within each work group in each dimension
275 *             (x,y,z)
276 * DWORDS 9+ : Kernel parameters
277 */
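/* For illustration only: the layout above could be modelled by a hypothetical
 * struct (this type does not exist in the driver, it is just a sketch):
 *
 *   struct hypothetical_compute_input {
 *           uint32_t num_work_groups[3]; // DWORDS 0-2
 *           uint32_t global_size[3];     // DWORDS 3-5
 *           uint32_t local_size[3];      // DWORDS 6-8
 *           uint32_t kernel_params[];    // DWORDS 9+, shader->input_size bytes
 *   };
 */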
278void evergreen_compute_upload_input(
279	struct pipe_context *ctx_,
280	const uint *block_layout,
281	const uint *grid_layout,
282	const void *input)
283{
284	struct r600_context *ctx = (struct r600_context *)ctx_;
285	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
286	unsigned i;
287	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
288	 * parameters.
289	 */
290	unsigned input_size = shader->input_size + 36;
291	uint32_t * num_work_groups_start;
292	uint32_t * global_size_start;
293	uint32_t * local_size_start;
294	uint32_t * kernel_parameters_start;
295	struct pipe_box box;
296	struct pipe_transfer *transfer = NULL;
297
298	if (shader->input_size == 0) {
299		return;
300	}
301
302	if (!shader->kernel_param) {
303		/* Add space for the grid dimensions */
304		shader->kernel_param = (struct r600_resource *)
305			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
306					PIPE_USAGE_IMMUTABLE, input_size);
307	}
308
309	u_box_1d(0, input_size, &box);
310	num_work_groups_start = ctx_->transfer_map(ctx_,
311			(struct pipe_resource*)shader->kernel_param,
312			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
313			&box, &transfer);
314	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
315	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
316	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
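	/* Each implicit block above is 3 dwords (12 bytes); since these are
	 * uint32_t pointers, an offset of 3 advances by 12 bytes. */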
317
318	/* Copy the number of work groups (the grid dimensions) */
319	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
320
321	/* Copy the global size */
322	for (i = 0; i < 3; i++) {
323		global_size_start[i] = grid_layout[i] * block_layout[i];
324	}
325
326	/* Copy the local dimensions */
327	memcpy(local_size_start, block_layout, 3 * sizeof(uint));
328
329	/* Copy the kernel inputs */
330	memcpy(kernel_parameters_start, input, shader->input_size);
331
332	for (i = 0; i < (input_size / 4); i++) {
333		COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
334			((unsigned*)num_work_groups_start)[i]);
335	}
336
337	ctx_->transfer_unmap(ctx_, transfer);
338
339	/* ID=0 is reserved for the parameters */
340	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
341			(struct pipe_resource*)shader->kernel_param);
342}
343
344static void evergreen_emit_direct_dispatch(
345		struct r600_context *rctx,
346		const uint *block_layout, const uint *grid_layout)
347{
348	int i;
349	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
350	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
351	unsigned num_waves;
352	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
353	unsigned wave_divisor = (16 * num_pipes);
354	int group_size = 1;
355	int grid_size = 1;
356	unsigned lds_size = shader->local_size / 4 +
357#if HAVE_LLVM < 0x0306
358		shader->active_kernel->bc.nlds_dw;
359#else
360		shader->bc.nlds_dw;
361#endif
362
363
364	/* Calculate group_size/grid_size */
365	for (i = 0; i < 3; i++) {
366		group_size *= block_layout[i];
367	}
368
369	for (i = 0; i < 3; i++)	{
370		grid_size *= grid_layout[i];
371	}
372
373	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
374	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
375			wave_divisor - 1) / wave_divisor;
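	/* Worked example (hypothetical numbers): a 16x16x1 thread block on a part
	 * with 4 pipes gives 256 threads / (16 * 4) = 4 wavefronts. */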
376
377	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
378				"%u wavefronts per thread block, "
379				"allocating %u dwords lds.\n",
380				num_pipes, num_waves, lds_size);
381
382	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
383
384	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
385	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
386	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
387	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
388
389	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
390								group_size);
391
392	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
393	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
394	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
395	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
396
397	if (rctx->b.chip_class < CAYMAN) {
398		assert(lds_size <= 8192);
399	} else {
400		/* Cayman appears to have a slightly smaller limit, see the
401		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
402		assert(lds_size <= 8160);
403	}
404
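	/* The SQ_LDS_ALLOC value below packs the LDS dword count in the low bits
	 * and the wavefront count at bit 14; e.g. (hypothetical values) 64 dwords
	 * and 4 waves encode as 64 | (4 << 14) = 0x10040. */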
405	radeon_compute_set_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
406					lds_size | (num_waves << 14));
407
408	/* Dispatch packet */
409	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
410	radeon_emit(cs, grid_layout[0]);
411	radeon_emit(cs, grid_layout[1]);
412	radeon_emit(cs, grid_layout[2]);
413	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
414	radeon_emit(cs, 1);
415}
416
417static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
418		const uint *grid_layout)
419{
420	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
421	unsigned i;
422
423	/* make sure that the gfx ring is the only one active */
424	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
425		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
426	}
427
428	/* Initialize all the compute-related registers.
429	 *
430	 * See evergreen_init_atom_start_compute_cs() in this file for the list
431	 * of registers initialized by the start_compute_cs_cmd atom.
432	 */
433	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
434
435	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
436	r600_flush_emit(ctx);
437
438	/* Emit colorbuffers. */
439	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
440	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
441		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
442		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
443						       (struct r600_resource*)cb->base.texture,
444						       RADEON_USAGE_READWRITE,
445						       RADEON_PRIO_SHADER_RW_BUFFER);
446
447		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
448		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
449		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
450		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
451		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
452		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
453		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
454		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
455
456		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
457		radeon_emit(cs, reloc);
458
459		if (!ctx->keep_tiling_flags) {
460			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
461			radeon_emit(cs, reloc);
462		}
463
464		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
465		radeon_emit(cs, reloc);
466	}
467	if (ctx->keep_tiling_flags) {
468		for (; i < 8 ; i++) {
469			radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
470						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
471		}
472		for (; i < 12; i++) {
473			radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
474						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
475		}
476	}
477
478	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
479	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
480					ctx->compute_cb_target_mask);
481
482
483	/* Emit vertex buffer state */
484	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
485	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
486
487	/* Emit constant buffer state */
488	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
489
490	/* Emit sampler state */
491	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
492
493	/* Emit sampler view (texture resource) state */
494	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
495
496	/* Emit compute shader state */
497	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
498
499	/* Emit dispatch state and dispatch packet */
500	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
501
502	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
503	 */
504	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
505		      R600_CONTEXT_INV_VERTEX_CACHE |
506	              R600_CONTEXT_INV_TEX_CACHE;
507	r600_flush_emit(ctx);
508	ctx->b.flags = 0;
509
510	if (ctx->b.chip_class >= CAYMAN) {
511		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
512		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
513		/* DEALLOC_STATE prevents the GPU from hanging when a
514		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
515		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
516		 */
517		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
518		cs->buf[cs->cdw++] = 0;
519	}
520
521#if 0
522	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
523	for (i = 0; i < cs->cdw; i++) {
524		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
525	}
526#endif
527
528}
529
530
531/**
532 * Emit function for r600_cs_shader_state atom
533 */
534void evergreen_emit_cs_shader(
535		struct r600_context *rctx,
536		struct r600_atom *atom)
537{
538	struct r600_cs_shader_state *state =
539					(struct r600_cs_shader_state*)atom;
540	struct r600_pipe_compute *shader = state->shader;
541	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
542	uint64_t va;
543	struct r600_resource *code_bo;
544	unsigned ngpr, nstack;
545
546#if HAVE_LLVM < 0x0306
547	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
548	code_bo = kernel->code_bo;
549	va = kernel->code_bo->gpu_address;
550	ngpr = kernel->bc.ngpr;
551	nstack = kernel->bc.nstack;
552#else
553	code_bo = shader->code_bo;
554	va = shader->code_bo->gpu_address + state->pc;
555	ngpr = shader->bc.ngpr;
556	nstack = shader->bc.nstack;
557#endif
558
559	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
560	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
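	/* SQ_PGM_START_LS takes the shader address in 256-byte units, hence the >> 8. */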
561	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
562			S_0288D4_NUM_GPRS(ngpr)
563			| S_0288D4_STACK_SIZE(nstack));
564	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
565
566	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
567	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
568					      code_bo, RADEON_USAGE_READ,
569					      RADEON_PRIO_USER_SHADER));
570}
571
572static void evergreen_launch_grid(
573		struct pipe_context *ctx_,
574		const uint *block_layout, const uint *grid_layout,
575		uint32_t pc, const void *input)
576{
577	struct r600_context *ctx = (struct r600_context *)ctx_;
578#ifdef HAVE_OPENCL
579	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
580	boolean use_kill;
581
582#if HAVE_LLVM < 0x0306
583	struct r600_kernel *kernel = &shader->kernels[pc];
584	(void)use_kill;
585        if (!kernel->code_bo) {
586                void *p;
587                struct r600_bytecode *bc = &kernel->bc;
588                LLVMModuleRef mod = kernel->llvm_module;
589                boolean use_kill = false;
590                bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
591                unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
592                unsigned sb_disasm = use_sb ||
593                        (ctx->screen->b.debug_flags & DBG_SB_DISASM);
594
595                r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
596                           ctx->screen->has_compressed_msaa_texturing);
597                bc->type = TGSI_PROCESSOR_COMPUTE;
598                bc->isa = ctx->isa;
599                r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
600
601                if (dump && !sb_disasm) {
602                        r600_bytecode_disasm(bc);
603                } else if ((dump && sb_disasm) || use_sb) {
604                        if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
605                                R600_ERR("r600_sb_bytecode_process failed!\n");
606                }
607
608                kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
609                                                        kernel->bc.ndw * 4);
610                p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
611                memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
612                ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
613        }
614	shader->active_kernel = kernel;
615	ctx->cs_shader_state.kernel_index = pc;
616#else
617	ctx->cs_shader_state.pc = pc;
618	/* Get the config information for this kernel. */
619	r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
620#endif
621#endif
622
623	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
624
625
626	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
627	compute_emit_cs(ctx, block_layout, grid_layout);
628}
629
630static void evergreen_set_compute_resources(struct pipe_context * ctx_,
631		unsigned start, unsigned count,
632		struct pipe_surface ** surfaces)
633{
634	struct r600_context *ctx = (struct r600_context *)ctx_;
635	struct r600_surface **resources = (struct r600_surface **)surfaces;
636
637	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
638			start, count);
639
640	for (unsigned i = 0; i < count; i++) {
641		/* The first two vertex buffers are reserved for parameters and
642		 * global buffers. */
643		unsigned vtx_id = 2 + i;
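		/* e.g. resources[0] uses VTX2 (and RAT1 if writable),
		 * resources[1] uses VTX3 (and RAT2), and so on. */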
644		if (resources[i]) {
645			struct r600_resource_global *buffer =
646				(struct r600_resource_global*)
647				resources[i]->base.texture;
648			if (resources[i]->base.writable) {
649				assert(i+1 < 12);
650
651				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
652				(struct r600_resource *)resources[i]->base.texture,
653				buffer->chunk->start_in_dw*4,
654				resources[i]->base.texture->width0);
655			}
656
657			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
658					buffer->chunk->start_in_dw * 4,
659					resources[i]->base.texture);
660		}
661	}
662}
663
664static void evergreen_set_global_binding(
665	struct pipe_context *ctx_, unsigned first, unsigned n,
666	struct pipe_resource **resources,
667	uint32_t **handles)
668{
669	struct r600_context *ctx = (struct r600_context *)ctx_;
670	struct compute_memory_pool *pool = ctx->screen->global_pool;
671	struct r600_resource_global **buffers =
672		(struct r600_resource_global **)resources;
673	unsigned i;
674
675	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
676			first, n);
677
678	if (!resources) {
679		/* XXX: Unset */
680		return;
681	}
682
683	/* We mark these items for promotion to the pool if they
684	 * aren't already there */
685	for (i = first; i < first + n; i++) {
686		struct compute_memory_item *item = buffers[i]->chunk;
687
688		if (!is_item_in_pool(item))
689			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
690	}
691
692	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
693		/* XXX: Unset */
694		return;
695	}
696
697	for (i = first; i < first + n; i++)
698	{
699		uint32_t buffer_offset;
700		uint32_t handle;
701		assert(resources[i]->target == PIPE_BUFFER);
702		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
703
704		buffer_offset = util_le32_to_cpu(*(handles[i]));
705		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
706
707		*(handles[i]) = util_cpu_to_le32(handle);
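		/* Worked example (hypothetical values): if the chunk starts at
		 * dword 16 and the stored handle was byte offset 4, the patched
		 * handle becomes 4 + 16 * 4 = 68. */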
708	}
709
710	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
711	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
712				(struct pipe_resource*)pool->bo);
713}
714
715/**
716 * This function initializes all the compute specific registers that need to
717 * be initialized for each compute command stream.  Registers that are common
718 * to both compute and 3D will be initialized at the beginning of each compute
719 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
720 * packet requires that the shader type bit be set, we must initialize all
721 * context registers needed for compute in this function.  The registers
722 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
723 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
724 * on the GPU family.
725 */
726void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
727{
728	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
729	int num_threads;
730	int num_stack_entries;
731
732	/* since all required registers are initialised in the
733	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
734	 */
735	r600_init_command_buffer(cb, 256);
736	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
737
738	/* This must be first. */
739	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
740	r600_store_value(cb, 0x80000000);
741	r600_store_value(cb, 0x80000000);
742
743	/* We're setting config registers here. */
744	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
745	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
746
747	switch (ctx->b.family) {
748	case CHIP_CEDAR:
749	default:
750		num_threads = 128;
751		num_stack_entries = 256;
752		break;
753	case CHIP_REDWOOD:
754		num_threads = 128;
755		num_stack_entries = 256;
756		break;
757	case CHIP_JUNIPER:
758		num_threads = 128;
759		num_stack_entries = 512;
760		break;
761	case CHIP_CYPRESS:
762	case CHIP_HEMLOCK:
763		num_threads = 128;
764		num_stack_entries = 512;
765		break;
766	case CHIP_PALM:
767		num_threads = 128;
768		num_stack_entries = 256;
769		break;
770	case CHIP_SUMO:
771		num_threads = 128;
772		num_stack_entries = 256;
773		break;
774	case CHIP_SUMO2:
775		num_threads = 128;
776		num_stack_entries = 512;
777		break;
778	case CHIP_BARTS:
779		num_threads = 128;
780		num_stack_entries = 512;
781		break;
782	case CHIP_TURKS:
783		num_threads = 128;
784		num_stack_entries = 256;
785		break;
786	case CHIP_CAICOS:
787		num_threads = 128;
788		num_stack_entries = 256;
789		break;
790	}
791
792	/* Config Registers */
793	if (ctx->b.chip_class < CAYMAN)
794		evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
795					   ctx->screen->b.info.drm_minor);
796	else
797		cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
798					ctx->screen->b.info.drm_minor);
799
800	/* The primitive type always needs to be POINTLIST for compute. */
801	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
802						V_008958_DI_PT_POINTLIST);
803
804	if (ctx->b.chip_class < CAYMAN) {
805
806		/* These registers control which simds can be used by each stage.
807		 * The default for these registers is 0xffffffff, which means
808		 * all simds are available for each stage.  It's possible we may
809		 * want to play around with these in the future, but for now
810		 * the default value is fine.
811		 *
812		 * R_008E20_SQ_STATIC_THREAD_MGMT1
813		 * R_008E24_SQ_STATIC_THREAD_MGMT2
814		 * R_008E28_SQ_STATIC_THREAD_MGMT3
815		 */
816
817		/* XXX: We may need to adjust the thread and stack resource
818		 * values for 3D/compute interop */
819
820		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
821
822		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
823		 * Set the number of threads used by the PS/VS/GS/ES stage to
824		 * 0.
825		 */
826		r600_store_value(cb, 0);
827
828		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
829		 * Set the number of threads used by the CS (aka LS) stage to
830		 * the maximum number of threads and set the number of threads
831		 * for the HS stage to 0. */
832		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
833
834		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
835		 * Set the Control Flow stack entries to 0 for PS/VS stages */
836		r600_store_value(cb, 0);
837
838		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
839		 * Set the Control Flow stack entries to 0 for GS/ES stages */
840		r600_store_value(cb, 0);
841
842		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
843		 * Set the Control Flow stack entries to 0 for the HS stage, and
844		 * set it to the maximum value for the CS (aka LS) stage. */
845		r600_store_value(cb,
846			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
847	}
848	/* Give the compute shader all the available LDS space.
849	 * NOTE: This only sets the maximum number of dwords that a compute
850	 * shader can allocate.  When a shader is executed, we still need to
851	 * allocate the appropriate amount of LDS dwords using the
852	 * CM_R_0288E8_SQ_LDS_ALLOC register.
853	 */
854	if (ctx->b.chip_class < CAYMAN) {
855		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
856			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
857	} else {
858		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
859			S_0286FC_NUM_PS_LDS(0) |
860			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
861	}
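	/* These maxima line up with the lds_size asserts in
	 * evergreen_emit_direct_dispatch(): 8192 dwords before Cayman and
	 * 8160 dwords on Cayman. */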
862
863	/* Context Registers */
864
865	if (ctx->b.chip_class < CAYMAN) {
866		/* workaround for hw issues with dyn gpr - must set all limits
867		 * to 240 instead of 0, 0x1e == 240 / 8
868		 */
869		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
870				S_028838_PS_GPRS(0x1e) |
871				S_028838_VS_GPRS(0x1e) |
872				S_028838_GS_GPRS(0x1e) |
873				S_028838_ES_GPRS(0x1e) |
874				S_028838_HS_GPRS(0x1e) |
875				S_028838_LS_GPRS(0x1e));
876	}
877
878	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
879	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
880		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
881
882	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
883
884	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
885						S_0286E8_TID_IN_GROUP_ENA |
886						S_0286E8_TGID_ENA |
887						S_0286E8_DISABLE_INDEX_PACK);
888
889
890	/* The LOOP_CONST registers are an optimization for loops that allows
891	 * you to store the initial counter, increment value, and maximum
892	 * counter value in a register so that the hardware can calculate the
893	 * correct number of iterations for the loop, and you don't need
894	 * to keep the loop counter in your shader code.  We don't currently use
895	 * this optimization, so we must keep track of the counter in the
896	 * shader and use a break instruction to exit loops.  However, the
897	 * hardware still uses this register to determine when to exit a
898	 * loop, so we need to initialize the counter to 0, set the increment
899	 * value to 1 and the maximum counter value to 4095 (0xfff), which
900	 * is the maximum value allowed.  This gives us a maximum of 4096
901	 * iterations for our loops, but hopefully our break instruction will
902	 * execute some time before the 4096th iteration.
903	 */
904	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
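	/* Decoding sketch (an assumption about the SQ_LOOP_CONST bit packing,
	 * count[11:0] / init[23:12] / increment[31:24], not verified here):
	 * 0x01000FFF would mean max count 0xFFF (4095), initial value 0 and
	 * increment 1, which matches the comment above. */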
905}
906
907void evergreen_init_compute_state_functions(struct r600_context *ctx)
908{
909	ctx->b.b.create_compute_state = evergreen_create_compute_state;
910	ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
911	ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
912//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
913	ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
914	ctx->b.b.set_global_binding = evergreen_set_global_binding;
915	ctx->b.b.launch_grid = evergreen_launch_grid;
916
917}
918
919struct pipe_resource *r600_compute_global_buffer_create(
920	struct pipe_screen *screen,
921	const struct pipe_resource *templ)
922{
923	struct r600_resource_global* result = NULL;
924	struct r600_screen* rscreen = NULL;
925	int size_in_dw = 0;
926
927	assert(templ->target == PIPE_BUFFER);
928	assert(templ->bind & PIPE_BIND_GLOBAL);
929	assert(templ->array_size == 1 || templ->array_size == 0);
930	assert(templ->depth0 == 1 || templ->depth0 == 0);
931	assert(templ->height0 == 1 || templ->height0 == 0);
932
933	result = (struct r600_resource_global*)
934	CALLOC(sizeof(struct r600_resource_global), 1);
935	rscreen = (struct r600_screen*)screen;
936
937	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
938	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
939			templ->array_size);
940
941	result->base.b.vtbl = &r600_global_buffer_vtbl;
942	result->base.b.b.screen = screen;
943	result->base.b.b = *templ;
944	pipe_reference_init(&result->base.b.b.reference, 1);
945
946	size_in_dw = (templ->width0+3) / 4;
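	/* e.g. a 10-byte buffer rounds up to (10 + 3) / 4 = 3 dwords. */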
947
948	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
949
950	if (result->chunk == NULL)
951	{
952		free(result);
953		return NULL;
954	}
955
956	return &result->base.b.b;
957}
958
959void r600_compute_global_buffer_destroy(
960	struct pipe_screen *screen,
961	struct pipe_resource *res)
962{
963	struct r600_resource_global* buffer = NULL;
964	struct r600_screen* rscreen = NULL;
965
966	assert(res->target == PIPE_BUFFER);
967	assert(res->bind & PIPE_BIND_GLOBAL);
968
969	buffer = (struct r600_resource_global*)res;
970	rscreen = (struct r600_screen*)screen;
971
972	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
973
974	buffer->chunk = NULL;
975	free(res);
976}
977
978void *r600_compute_global_transfer_map(
979	struct pipe_context *ctx_,
980	struct pipe_resource *resource,
981	unsigned level,
982	unsigned usage,
983	const struct pipe_box *box,
984	struct pipe_transfer **ptransfer)
985{
986	struct r600_context *rctx = (struct r600_context*)ctx_;
987	struct compute_memory_pool *pool = rctx->screen->global_pool;
988	struct r600_resource_global* buffer =
989		(struct r600_resource_global*)resource;
990
991	struct compute_memory_item *item = buffer->chunk;
992	struct pipe_resource *dst = NULL;
993	unsigned offset = box->x;
994
995	if (is_item_in_pool(item)) {
996		compute_memory_demote_item(pool, item, ctx_);
997	}
998	else {
999		if (item->real_buffer == NULL) {
1000			item->real_buffer = (struct r600_resource*)
1001					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1002		}
1003	}
1004
1005	dst = (struct pipe_resource*)item->real_buffer;
1006
1007	if (usage & PIPE_TRANSFER_READ)
1008		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1009
1010	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1011			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1012			"width = %u, height = %u, depth = %u)\n", level, usage,
1013			box->x, box->y, box->z, box->width, box->height,
1014			box->depth);
1015	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1016		"%u (box.x)\n", item->id, box->x);
1017
1018
1019	assert(resource->target == PIPE_BUFFER);
1020	assert(resource->bind & PIPE_BIND_GLOBAL);
1021	assert(box->x >= 0);
1022	assert(box->y == 0);
1023	assert(box->z == 0);
1024
1025	/* TODO: do this better; mapping is not possible if the pool is too big */
1026	return pipe_buffer_map_range(ctx_, dst,
1027			offset, box->width, usage, ptransfer);
1028}
1029
1030void r600_compute_global_transfer_unmap(
1031	struct pipe_context *ctx_,
1032	struct pipe_transfer* transfer)
1033{
1034	/* struct r600_resource_global is not a real resource, it just maps
1035	 * to an offset within the compute memory pool.  The function
1036	 * r600_compute_global_transfer_map() maps the memory pool
1037	 * resource rather than the struct r600_resource_global passed to
1038	 * it as an argument, and then initializes ptransfer->resource with
1039	 * the memory pool resource (via pipe_buffer_map_range).
1040	 * When transfer_unmap is called, it goes through the memory pool
1041	 * resource's vtable, which calls r600_buffer_transfer_unmap() rather
1042	 * than this function.
1043	 */
1044	assert (!"This function should not be called");
1045}
1046
1047void r600_compute_global_transfer_flush_region(
1048	struct pipe_context *ctx_,
1049	struct pipe_transfer *transfer,
1050	const struct pipe_box *box)
1051{
1052	assert(0 && "TODO");
1053}
1054
1055void r600_compute_global_transfer_inline_write(
1056	struct pipe_context *pipe,
1057	struct pipe_resource *resource,
1058	unsigned level,
1059	unsigned usage,
1060	const struct pipe_box *box,
1061	const void *data,
1062	unsigned stride,
1063	unsigned layer_stride)
1064{
1065	assert(0 && "TODO");
1066}
1067