evergreen_compute.c revision e377037bef521a985dc801371f195ada327ec304
1/*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 *      Adam Rak <adam.rak@streamnovation.com>
25 */
26
27#include <stdio.h>
28#include <errno.h>
29#include "pipe/p_defines.h"
30#include "pipe/p_state.h"
31#include "pipe/p_context.h"
32#include "util/u_blitter.h"
33#include "util/list.h"
34#include "util/u_transfer.h"
35#include "util/u_surface.h"
36#include "util/u_pack_color.h"
37#include "util/u_memory.h"
38#include "util/u_inlines.h"
39#include "util/u_framebuffer.h"
40#include "pipebuffer/pb_buffer.h"
41#include "evergreend.h"
42#include "r600_shader.h"
43#include "r600_pipe.h"
44#include "r600_formats.h"
45#include "evergreen_compute.h"
46#include "evergreen_compute_internal.h"
47#include "compute_memory_pool.h"
48#include "sb/sb_public.h"
49#ifdef HAVE_OPENCL
50#include "radeon/radeon_llvm_util.h"
51#endif
52#include "radeon/radeon_elf_util.h"
53#include <inttypes.h>
54
55/**
56RAT0 is for global binding write
57VTX1 is for global binding read
58
59for writing images, RAT1...
60for reading images, TEX2...
61  TEX2 and RAT1 are paired, and so on
62
63TEX2... consumes the same fetch resources that VTX2... would consume
64
65CONST0 and VTX0 are for parameters
66  CONST0 binds the smaller input parameter buffer and is used for constant
67  indexing; it is also constant cached
68  VTX0 is for indirect/non-constant indexing, or when the input is bigger than
69  the constant cache can handle
70
71RATs are limited to 12, so we can bind at most 11 textures for writing,
72because we reserve RAT0 for global bindings. With byte addressing enabled,
73we should reserve another one too => 10 image bindings for writing at most.
74
75from Nvidia OpenCL:
76  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
77  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
78
79so 10 for writing is enough. 176 is the max for reading according to the docs.
80
81writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
82writable images will consume TEX slots, and VTX slots too, because of linear indexing
83
84*/
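
/* A concrete sketch of the binding layout that follows from the rules above
 * (illustrative only, derived from this comment rather than from the code):
 *
 *   kernel parameters   -> CONST0 (VTX0 for indirect/large inputs)
 *   global buffers      -> RAT0 for writes, VTX1 for reads
 *   image 0 (writable)  -> RAT1 for writes, TEX2 for reads
 *   image 1 (writable)  -> RAT2 for writes, TEX3 for reads
 *   ...
 */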
85
86struct r600_resource* r600_compute_buffer_alloc_vram(
87       struct r600_screen *screen,
88       unsigned size)
89{
90	struct pipe_resource * buffer = NULL;
91	assert(size);
92
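	/* PIPE_USAGE_IMMUTABLE is what should make this buffer end up in VRAM;
	 * this is an assumption based on the helper's name rather than on the
	 * winsys code, which lives outside this file. */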
93	buffer = pipe_buffer_create(
94		(struct pipe_screen*) screen,
95		PIPE_BIND_CUSTOM,
96		PIPE_USAGE_IMMUTABLE,
97		size);
98
99	return (struct r600_resource *)buffer;
100}
101
102
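/* Bind a buffer as RAT (random access target) number "id": the buffer is
 * exposed to the CB block as an extra R32_UINT color buffer so that compute
 * writes can go through the RAT path.  (Descriptive summary of the code
 * below; "RAT" is the Evergreen term for these surfaces.) */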
103static void evergreen_set_rat(
104	struct r600_pipe_compute *pipe,
105	unsigned id,
106	struct r600_resource* bo,
107	int start,
108	int size)
109{
110	struct pipe_surface rat_templ;
111	struct r600_surface *surf = NULL;
112	struct r600_context *rctx = NULL;
113
114	assert(id < 12);
115	assert((size & 3) == 0);
116	assert((start & 0xFF) == 0);
117
118	rctx = pipe->ctx;
119
120	COMPUTE_DBG(rctx->screen, "bind rat: %u\n", id);
121
122	/* Create the RAT surface */
123	memset(&rat_templ, 0, sizeof(rat_templ));
124	rat_templ.format = PIPE_FORMAT_R32_UINT;
125	rat_templ.u.tex.level = 0;
126	rat_templ.u.tex.first_layer = 0;
127	rat_templ.u.tex.last_layer = 0;
128
129	/* Add the RAT to the list of color buffers */
130	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
131		(struct pipe_context *)pipe->ctx,
132		(struct pipe_resource *)bo, &rat_templ);
133
134	/* Update the number of color buffers */
135	pipe->ctx->framebuffer.state.nr_cbufs =
136		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
137
138	/* Update the cb_target_mask
139	 * XXX: I think this is a potential spot for bugs once we start doing
140	 * GL interop.  cb_target_mask may be modified in the 3D sections
141	 * of this driver. */
142	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
143
144	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
145	evergreen_init_color_surface_rat(rctx, surf);
146}
147
148static void evergreen_cs_set_vertex_buffer(
149	struct r600_context * rctx,
150	unsigned vb_index,
151	unsigned offset,
152	struct pipe_resource * buffer)
153{
154	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
155	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
156	vb->stride = 1;
157	vb->buffer_offset = offset;
158	vb->buffer = buffer;
159	vb->user_buffer = NULL;
160
161	/* The vertex instructions in the compute shaders use the texture cache,
162	 * so we need to invalidate it. */
163	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
164	state->enabled_mask |= 1 << vb_index;
165	state->dirty_mask |= 1 << vb_index;
166	r600_mark_atom_dirty(rctx, &state->atom);
167}
168
169static void evergreen_cs_set_constant_buffer(
170	struct r600_context * rctx,
171	unsigned cb_index,
172	unsigned offset,
173	unsigned size,
174	struct pipe_resource * buffer)
175{
176	struct pipe_constant_buffer cb;
177	cb.buffer_size = size;
178	cb.buffer_offset = offset;
179	cb.buffer = buffer;
180	cb.user_buffer = NULL;
181
182	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
183}
184
185static const struct u_resource_vtbl r600_global_buffer_vtbl =
186{
187	u_default_resource_get_handle, /* get_handle */
188	r600_compute_global_buffer_destroy, /* resource_destroy */
189	r600_compute_global_transfer_map, /* transfer_map */
190	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
191	r600_compute_global_transfer_unmap, /* transfer_unmap */
192	r600_compute_global_transfer_inline_write /* transfer_inline_write */
193};
194
195
196void *evergreen_create_compute_state(
197	struct pipe_context *ctx_,
198	const struct pipe_compute_state *cso)
199{
200	struct r600_context *ctx = (struct r600_context *)ctx_;
201	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
202#ifdef HAVE_OPENCL
203	const struct pipe_llvm_program_header * header;
204	const char *code;
205	void *p;
206	boolean use_kill;
207
208	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
209	header = cso->prog;
210	code = cso->prog + sizeof(struct pipe_llvm_program_header);
211#if HAVE_LLVM < 0x0306
212	(void)use_kill;
213	(void)p;
214	shader->llvm_ctx = LLVMContextCreate();
215	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx,
216				code, header->num_bytes);
217	shader->kernels = CALLOC(sizeof(struct r600_kernel),
218				shader->num_kernels);
219	{
220		unsigned i;
221		for (i = 0; i < shader->num_kernels; i++) {
222			struct r600_kernel *kernel = &shader->kernels[i];
223			kernel->llvm_module = radeon_llvm_get_kernel_module(
224				shader->llvm_ctx, i, code, header->num_bytes);
225		}
226	}
227#else
228	radeon_shader_binary_init(&shader->binary);
229	radeon_elf_read(code, header->num_bytes, &shader->binary);
230	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
231
232	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
233							shader->bc.ndw * 4);
234	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
235	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
236	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
237#endif
238#endif
239
240	shader->ctx = ctx;
241	shader->local_size = cso->req_local_mem;
242	shader->private_size = cso->req_private_mem;
243	shader->input_size = cso->req_input_mem;
244
245	return shader;
246}
247
248void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
249{
250	struct r600_context *ctx = (struct r600_context *)ctx_;
251	COMPUTE_DBG(ctx->screen, "*** evergreen_delete_compute_state\n");
252	struct r600_pipe_compute *shader = state;
253
254	if (!shader)
255		return;
256
257#ifdef HAVE_OPENCL
258#if HAVE_LLVM < 0x0306
259	for (unsigned i = 0; i < shader->num_kernels; i++) {
260		struct r600_kernel *kernel = &shader->kernels[i];
261		LLVMDisposeModule(kernel->llvm_module);
262	}
263	FREE(shader->kernels);
264	LLVMContextDispose(shader->llvm_ctx);
265#else
266	radeon_shader_binary_clean(&shader->binary);
267	r600_destroy_shader(&shader->bc);
268
269	/* TODO destroy shader->code_bo, shader->const_bo
270	 * we'll need something like r600_buffer_free */
271#endif
272#endif
273	FREE(shader);
274}
275
276static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
277{
278	struct r600_context *ctx = (struct r600_context *)ctx_;
279
280	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
281
282	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
283}
284
285/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the explicit
286 * kernel parameters, there are implicit parameters that need to be stored
287 * in the vertex buffer as well.  Here is how these parameters are organized in
288 * the buffer:
289 *
290 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
291 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
292 * DWORDS 6-8: Number of work items within each work group in each dimension
293 *             (x,y,z)
294 * DWORDS 9+ : Kernel parameters
295 */
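/* For example (a purely hypothetical launch, not taken from the driver):
 * a grid of 4x2x1 work groups, each 64x1x1 work items, would upload
 *   DWORDS 0-2: 4, 2, 1      (number of work groups)
 *   DWORDS 3-5: 256, 2, 1    (global work items = grid * block, per dimension)
 *   DWORDS 6-8: 64, 1, 1     (work items per work group)
 *   DWORDS 9+ : the kernel's own arguments
 */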
296void evergreen_compute_upload_input(
297	struct pipe_context *ctx_,
298	const uint *block_layout,
299	const uint *grid_layout,
300	const void *input)
301{
302	struct r600_context *ctx = (struct r600_context *)ctx_;
303	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
304	unsigned i;
305	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
306	 * parameters.
307	 */
308	unsigned input_size = shader->input_size + 36;
309	uint32_t * num_work_groups_start;
310	uint32_t * global_size_start;
311	uint32_t * local_size_start;
312	uint32_t * kernel_parameters_start;
313	struct pipe_box box;
314	struct pipe_transfer *transfer = NULL;
315
316	if (shader->input_size == 0) {
317		return;
318	}
319
320	if (!shader->kernel_param) {
321		/* Add space for the grid dimensions */
322		shader->kernel_param = (struct r600_resource *)
323			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
324					PIPE_USAGE_IMMUTABLE, input_size);
325	}
326
327	u_box_1d(0, input_size, &box);
328	num_work_groups_start = ctx_->transfer_map(ctx_,
329			(struct pipe_resource*)shader->kernel_param,
330			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
331			&box, &transfer);
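	/* The pointer arithmetic below is in uint32_t units: each block of grid
	 * information (work groups, global size, local size) is 3 dwords. */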
332	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
333	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
334	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
335
336	/* Copy the number of work groups (the grid layout) */
337	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
338
339	/* Copy the global size */
340	for (i = 0; i < 3; i++) {
341		global_size_start[i] = grid_layout[i] * block_layout[i];
342	}
343
344	/* Copy the local dimensions */
345	memcpy(local_size_start, block_layout, 3 * sizeof(uint));
346
347	/* Copy the kernel inputs */
348	memcpy(kernel_parameters_start, input, shader->input_size);
349
350	for (i = 0; i < (input_size / 4); i++) {
351		COMPUTE_DBG(ctx->screen, "input %i : %u\n", i,
352			((unsigned*)num_work_groups_start)[i]);
353	}
354
355	ctx_->transfer_unmap(ctx_, transfer);
356
357	/* ID=0 is reserved for the parameters */
358	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
359			(struct pipe_resource*)shader->kernel_param);
360}
361
362static void evergreen_emit_direct_dispatch(
363		struct r600_context *rctx,
364		const uint *block_layout, const uint *grid_layout)
365{
366	int i;
367	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
368	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
369	unsigned num_waves;
370	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
371	unsigned wave_divisor = (16 * num_pipes);
372	int group_size = 1;
373	int grid_size = 1;
374	unsigned lds_size = shader->local_size / 4 +
375#if HAVE_LLVM < 0x0306
376		shader->active_kernel->bc.nlds_dw;
377#else
378		shader->bc.nlds_dw;
379#endif
380
381
382	/* Calculate group_size/grid_size */
383	for (i = 0; i < 3; i++) {
384		group_size *= block_layout[i];
385	}
386
387	for (i = 0; i < 3; i++)	{
388		grid_size *= grid_layout[i];
389	}
390
391	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
392	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
393			wave_divisor - 1) / wave_divisor;
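	/* For example (hypothetical numbers): a 16x16x1 block on a 2-pipe part
	 * gives ceil(256 / (16 * 2)) = 8 wavefronts per thread block. */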
394
395	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
396				"%u wavefronts per thread block, "
397				"allocating %u dwords lds.\n",
398				num_pipes, num_waves, lds_size);
399
400	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
401
402	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
403	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
404	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
405	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
406
407	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
408								group_size);
409
410	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
411	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
412	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
413	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
414
415	if (rctx->b.chip_class < CAYMAN) {
416		assert(lds_size <= 8192);
417	} else {
418		/* Cayman appears to have a slightly smaller limit, see the
419		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
420		assert(lds_size <= 8160);
421	}
422
423	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
424					lds_size | (num_waves << 14));
425
426	/* Dispatch packet */
427	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
428	radeon_emit(cs, grid_layout[0]);
429	radeon_emit(cs, grid_layout[1]);
430	radeon_emit(cs, grid_layout[2]);
431	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
432	radeon_emit(cs, 1);
433}
434
435static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
436		const uint *grid_layout)
437{
438	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
439	unsigned i;
440
441	/* make sure that the gfx ring is the only one active */
442	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
443		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
444	}
445
446	/* Initialize all the compute-related registers.
447	 *
448	 * See evergreen_init_atom_start_compute_cs() in this file for the list
449	 * of registers initialized by the start_compute_cs_cmd atom.
450	 */
451	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
452
453	/* emit config state */
454	if (ctx->b.chip_class == EVERGREEN)
455		r600_emit_atom(ctx, &ctx->config_state.atom);
456
457	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
458	r600_flush_emit(ctx);
459
460	/* Emit colorbuffers. */
461	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
462	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
463		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
464		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
465						       (struct r600_resource*)cb->base.texture,
466						       RADEON_USAGE_READWRITE,
467						       RADEON_PRIO_SHADER_RW_BUFFER);
468
469		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
470		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
471		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
472		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
473		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
474		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
475		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
476		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
477
478		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
479		radeon_emit(cs, reloc);
480
481		if (!ctx->keep_tiling_flags) {
482			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
483			radeon_emit(cs, reloc);
484		}
485
486		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
487		radeon_emit(cs, reloc);
488	}
489	if (ctx->keep_tiling_flags) {
490		for (; i < 8 ; i++) {
491			radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
492						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
493		}
494		for (; i < 12; i++) {
495			radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
496						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
497		}
498	}
499
500	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
501	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
502					ctx->compute_cb_target_mask);
503
504
505	/* Emit vertex buffer state */
506	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
507	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
508
509	/* Emit constant buffer state */
510	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
511
512	/* Emit sampler state */
513	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
514
515	/* Emit sampler view (texture resource) state */
516	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
517
518	/* Emit compute shader state */
519	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
520
521	/* Emit dispatch state and dispatch packet */
522	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
523
524	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
525	 */
526	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
527		      R600_CONTEXT_INV_VERTEX_CACHE |
528	              R600_CONTEXT_INV_TEX_CACHE;
529	r600_flush_emit(ctx);
530	ctx->b.flags = 0;
531
532	if (ctx->b.chip_class >= CAYMAN) {
533		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
534		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
535		/* DEALLOC_STATE prevents the GPU from hanging when a
536		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
537		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
538		 */
539		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
540		cs->buf[cs->cdw++] = 0;
541	}
542
543#if 0
544	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
545	for (i = 0; i < cs->cdw; i++) {
546		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
547	}
548#endif
549
550}
551
552
553/**
554 * Emit function for r600_cs_shader_state atom
555 */
556void evergreen_emit_cs_shader(
557		struct r600_context *rctx,
558		struct r600_atom *atom)
559{
560	struct r600_cs_shader_state *state =
561					(struct r600_cs_shader_state*)atom;
562	struct r600_pipe_compute *shader = state->shader;
563	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
564	uint64_t va;
565	struct r600_resource *code_bo;
566	unsigned ngpr, nstack;
567
568#if HAVE_LLVM < 0x0306
569	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
570	code_bo = kernel->code_bo;
571	va = kernel->code_bo->gpu_address;
572	ngpr = kernel->bc.ngpr;
573	nstack = kernel->bc.nstack;
574#else
575	code_bo = shader->code_bo;
576	va = shader->code_bo->gpu_address + state->pc;
577	ngpr = shader->bc.ngpr;
578	nstack = shader->bc.nstack;
579#endif
580
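	/* The shader start address is programmed in units of 256 bytes, hence
	 * the va >> 8 below. */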
581	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
582	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
583	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
584			S_0288D4_NUM_GPRS(ngpr)
585			| S_0288D4_STACK_SIZE(nstack));
586	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
587
588	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
589	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
590					      code_bo, RADEON_USAGE_READ,
591					      RADEON_PRIO_USER_SHADER));
592}
593
594static void evergreen_launch_grid(
595		struct pipe_context *ctx_,
596		const uint *block_layout, const uint *grid_layout,
597		uint32_t pc, const void *input)
598{
599	struct r600_context *ctx = (struct r600_context *)ctx_;
600#ifdef HAVE_OPENCL
601	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
602	boolean use_kill;
603
604#if HAVE_LLVM < 0x0306
605	struct r600_kernel *kernel = &shader->kernels[pc];
606	(void)use_kill;
607	if (!kernel->code_bo) {
608		void *p;
609		struct r600_bytecode *bc = &kernel->bc;
610		LLVMModuleRef mod = kernel->llvm_module;
611		boolean use_kill = false;
612		bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
613		unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
614		unsigned sb_disasm = use_sb ||
615			(ctx->screen->b.debug_flags & DBG_SB_DISASM);
616
617		r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
618				   ctx->screen->has_compressed_msaa_texturing);
619		bc->type = TGSI_PROCESSOR_COMPUTE;
620		bc->isa = ctx->isa;
621		r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump, &ctx->b.debug);
622
623		if (dump && !sb_disasm) {
624			r600_bytecode_disasm(bc);
625		} else if ((dump && sb_disasm) || use_sb) {
626			if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
627				R600_ERR("r600_sb_bytecode_process failed!\n");
628		}
629
630		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
631							kernel->bc.ndw * 4);
632		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
633		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
634		ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
635	}
636	shader->active_kernel = kernel;
637	ctx->cs_shader_state.kernel_index = pc;
638#else
639	ctx->cs_shader_state.pc = pc;
640	/* Get the config information for this kernel. */
641	r600_shader_binary_read_config(&shader->binary, &shader->bc, pc, &use_kill);
642#endif
643#endif
644
645	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
646
647
648	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
649	compute_emit_cs(ctx, block_layout, grid_layout);
650}
651
652static void evergreen_set_compute_resources(struct pipe_context * ctx_,
653		unsigned start, unsigned count,
654		struct pipe_surface ** surfaces)
655{
656	struct r600_context *ctx = (struct r600_context *)ctx_;
657	struct r600_surface **resources = (struct r600_surface **)surfaces;
658
659	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
660			start, count);
661
662	for (unsigned i = 0; i < count; i++) {
663		/* The first two vertex buffers are reserved for parameters and
664		 * global buffers. */
665		unsigned vtx_id = 2 + i;
666		if (resources[i]) {
667			struct r600_resource_global *buffer =
668				(struct r600_resource_global*)
669				resources[i]->base.texture;
670			if (resources[i]->base.writable) {
671				assert(i+1 < 12);
672
673				evergreen_set_rat(ctx->cs_shader_state.shader, i + 1,
674						  (struct r600_resource *)resources[i]->base.texture,
675						  buffer->chunk->start_in_dw * 4,
676						  resources[i]->base.texture->width0);
677			}
678
679			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
680					buffer->chunk->start_in_dw * 4,
681					resources[i]->base.texture);
682		}
683	}
684}
685
686static void evergreen_set_global_binding(
687	struct pipe_context *ctx_, unsigned first, unsigned n,
688	struct pipe_resource **resources,
689	uint32_t **handles)
690{
691	struct r600_context *ctx = (struct r600_context *)ctx_;
692	struct compute_memory_pool *pool = ctx->screen->global_pool;
693	struct r600_resource_global **buffers =
694		(struct r600_resource_global **)resources;
695	unsigned i;
696
697	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
698			first, n);
699
700	if (!resources) {
701		/* XXX: Unset */
702		return;
703	}
704
705	/* We mark these items for promotion to the pool if they
706	 * aren't already there */
707	for (i = first; i < first + n; i++) {
708		struct compute_memory_item *item = buffers[i]->chunk;
709
710		if (!is_item_in_pool(item))
711			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
712	}
713
714	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
715		/* XXX: Unset */
716		return;
717	}
718
719	for (i = first; i < first + n; i++)
720	{
721		uint32_t buffer_offset;
722		uint32_t handle;
723		assert(resources[i]->target == PIPE_BUFFER);
724		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
725
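		/* The incoming handle value is an offset into the buffer;
		 * rebase it onto the buffer's location in the global memory
		 * pool (the chunk start is in dwords, hence the * 4) and hand
		 * it back in little-endian form. */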
726		buffer_offset = util_le32_to_cpu(*(handles[i]));
727		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
728
729		*(handles[i]) = util_cpu_to_le32(handle);
730	}
731
732	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
733	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
734				(struct pipe_resource*)pool->bo);
735}
736
737/**
738 * This function initializes all the compute specific registers that need to
739 * be initialized for each compute command stream.  Registers that are common
740 * to both compute and 3D will be initialized at the beginning of each compute
741 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
742 * packet requires that the shader type bit be set, we must initialize all
743 * context registers needed for compute in this function.  The registers
744 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
745 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
746 * on the GPU family.
747 */
748void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
749{
750	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
751	int num_threads;
752	int num_stack_entries;
753
754	/* since all required registers are initialized in the
755	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
756	 */
757	r600_init_command_buffer(cb, 256);
758	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
759
760	/* This must be first. */
761	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
762	r600_store_value(cb, 0x80000000);
763	r600_store_value(cb, 0x80000000);
764
765	/* We're setting config registers here. */
766	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
767	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
768
769	switch (ctx->b.family) {
770	case CHIP_CEDAR:
771	default:
772		num_threads = 128;
773		num_stack_entries = 256;
774		break;
775	case CHIP_REDWOOD:
776		num_threads = 128;
777		num_stack_entries = 256;
778		break;
779	case CHIP_JUNIPER:
780		num_threads = 128;
781		num_stack_entries = 512;
782		break;
783	case CHIP_CYPRESS:
784	case CHIP_HEMLOCK:
785		num_threads = 128;
786		num_stack_entries = 512;
787		break;
788	case CHIP_PALM:
789		num_threads = 128;
790		num_stack_entries = 256;
791		break;
792	case CHIP_SUMO:
793		num_threads = 128;
794		num_stack_entries = 256;
795		break;
796	case CHIP_SUMO2:
797		num_threads = 128;
798		num_stack_entries = 512;
799		break;
800	case CHIP_BARTS:
801		num_threads = 128;
802		num_stack_entries = 512;
803		break;
804	case CHIP_TURKS:
805		num_threads = 128;
806		num_stack_entries = 256;
807		break;
808	case CHIP_CAICOS:
809		num_threads = 128;
810		num_stack_entries = 256;
811		break;
812	}
813
814	/* Config Registers */
815	if (ctx->b.chip_class < CAYMAN)
816		evergreen_init_common_regs(ctx, cb, ctx->b.chip_class, ctx->b.family,
817					   ctx->screen->b.info.drm_minor);
818	else
819		cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
820					ctx->screen->b.info.drm_minor);
821
822	/* The primitive type always needs to be POINTLIST for compute. */
823	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
824						V_008958_DI_PT_POINTLIST);
825
826	if (ctx->b.chip_class < CAYMAN) {
827
828		/* These registers control which simds can be used by each stage.
829		 * The default for these registers is 0xffffffff, which means
830		 * all simds are available for each stage.  It's possible we may
831		 * want to play around with these in the future, but for now
832		 * the default value is fine.
833		 *
834		 * R_008E20_SQ_STATIC_THREAD_MGMT1
835		 * R_008E24_SQ_STATIC_THREAD_MGMT2
836		 * R_008E28_SQ_STATIC_THREAD_MGMT3
837		 */
838
839		/* XXX: We may need to adjust the thread and stack resource
840		 * values for 3D/compute interop */
841
842		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
843
844		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
845		 * Set the number of threads used by the PS/VS/GS/ES stage to
846		 * 0.
847		 */
848		r600_store_value(cb, 0);
849
850		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
851		 * Set the number of threads used by the CS (aka LS) stage to
852		 * the maximum number of threads and set the number of threads
853		 * for the HS stage to 0. */
854		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
855
856		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
857		 * Set the Control Flow stack entries to 0 for PS/VS stages */
858		r600_store_value(cb, 0);
859
860		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
861		 * Set the Control Flow stack entries to 0 for GS/ES stages */
862		r600_store_value(cb, 0);
863
864		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
865		 * Set the Control Flow stack entries to 0 for the HS stage, and
866		 * set it to the maximum value for the CS (aka LS) stage. */
867		r600_store_value(cb,
868			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
869	}
870	/* Give the compute shader all the available LDS space.
871	 * NOTE: This only sets the maximum number of dwords that a compute
872	 * shader can allocate.  When a shader is executed, we still need to
873	 * allocate the appropriate amount of LDS dwords using the
874	 * CM_R_0288E8_SQ_LDS_ALLOC register.
875	 */
876	if (ctx->b.chip_class < CAYMAN) {
877		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
878			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
879	} else {
880		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
881			S_0286FC_NUM_PS_LDS(0) |
882			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
883	}
884
885	/* Context Registers */
886
887	if (ctx->b.chip_class < CAYMAN) {
888		/* workaround for hw issues with dyn gpr - must set all limits
889		 * to 240 instead of 0, 0x1e == 240 / 8
890		 */
891		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
892				S_028838_PS_GPRS(0x1e) |
893				S_028838_VS_GPRS(0x1e) |
894				S_028838_GS_GPRS(0x1e) |
895				S_028838_ES_GPRS(0x1e) |
896				S_028838_HS_GPRS(0x1e) |
897				S_028838_LS_GPRS(0x1e));
898	}
899
900	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
901	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
902		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
903
904	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
905
906	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
907						S_0286E8_TID_IN_GROUP_ENA
908						| S_0286E8_TGID_ENA
909						| S_0286E8_DISABLE_INDEX_PACK);
910
911
912	/* The LOOP_CONST registers are an optimization for loops that allows
913	 * you to store the initial counter, increment value, and maximum
914	 * counter value in a register so that hardware can calculate the
915	 * correct number of iterations for the loop, so that you don't need
916	 * to have the loop counter in your shader code.  We don't currently use
917	 * this optimization, so we must keep track of the counter in the
918	 * shader and use a break instruction to exit loops.  However, the
919	 * hardware will still use this register to determine when to exit a
920	 * loop, so we need to initialize the counter to 0, set the increment
921	 * value to 1 and the maximum counter value to 4095 (0xfff), which
922	 * is the maximum value allowed.  This gives us a maximum of 4096
923	 * iterations for our loops, but hopefully our break instruction will
924	 * execute some time before the 4096th iteration.
925	 */
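	/* 0x1000FFF appears to pack the values described above: the low
	 * 12 bits hold the maximum counter value (0xfff) and the 1 in the top
	 * byte is the increment; the initial value field stays 0. */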
926	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
927}
928
929void evergreen_init_compute_state_functions(struct r600_context *ctx)
930{
931	ctx->b.b.create_compute_state = evergreen_create_compute_state;
932	ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
933	ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
934//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
935	ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
936	ctx->b.b.set_global_binding = evergreen_set_global_binding;
937	ctx->b.b.launch_grid = evergreen_launch_grid;
938
939}
940
941struct pipe_resource *r600_compute_global_buffer_create(
942	struct pipe_screen *screen,
943	const struct pipe_resource *templ)
944{
945	struct r600_resource_global* result = NULL;
946	struct r600_screen* rscreen = NULL;
947	int size_in_dw = 0;
948
949	assert(templ->target == PIPE_BUFFER);
950	assert(templ->bind & PIPE_BIND_GLOBAL);
951	assert(templ->array_size == 1 || templ->array_size == 0);
952	assert(templ->depth0 == 1 || templ->depth0 == 0);
953	assert(templ->height0 == 1 || templ->height0 == 0);
954
955	result = (struct r600_resource_global*)
956		CALLOC(sizeof(struct r600_resource_global), 1);
957	rscreen = (struct r600_screen*)screen;
958
959	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
960	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
961			templ->array_size);
962
963	result->base.b.vtbl = &r600_global_buffer_vtbl;
964	result->base.b.b = *templ;
965	result->base.b.b.screen = screen;
966	pipe_reference_init(&result->base.b.b.reference, 1);
967
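	/* Round the size up to whole dwords; the compute memory pool
	 * allocates in dword units. */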
968	size_in_dw = (templ->width0+3) / 4;
969
970	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
971
972	if (result->chunk == NULL)
973	{
974		free(result);
975		return NULL;
976	}
977
978	return &result->base.b.b;
979}
980
981void r600_compute_global_buffer_destroy(
982	struct pipe_screen *screen,
983	struct pipe_resource *res)
984{
985	struct r600_resource_global* buffer = NULL;
986	struct r600_screen* rscreen = NULL;
987
988	assert(res->target == PIPE_BUFFER);
989	assert(res->bind & PIPE_BIND_GLOBAL);
990
991	buffer = (struct r600_resource_global*)res;
992	rscreen = (struct r600_screen*)screen;
993
994	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
995
996	buffer->chunk = NULL;
997	free(res);
998}
999
1000void *r600_compute_global_transfer_map(
1001	struct pipe_context *ctx_,
1002	struct pipe_resource *resource,
1003	unsigned level,
1004	unsigned usage,
1005	const struct pipe_box *box,
1006	struct pipe_transfer **ptransfer)
1007{
1008	struct r600_context *rctx = (struct r600_context*)ctx_;
1009	struct compute_memory_pool *pool = rctx->screen->global_pool;
1010	struct r600_resource_global* buffer =
1011		(struct r600_resource_global*)resource;
1012
1013	struct compute_memory_item *item = buffer->chunk;
1014	struct pipe_resource *dst = NULL;
1015	unsigned offset = box->x;
1016
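	/* Items that still live inside the pool must be demoted to their own
	 * buffer before the CPU can map them; items outside the pool get a
	 * dedicated buffer allocated on first use. */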
1017	if (is_item_in_pool(item)) {
1018		compute_memory_demote_item(pool, item, ctx_);
1019	}
1020	else {
1021		if (item->real_buffer == NULL) {
1022			item->real_buffer =
1023					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1024		}
1025	}
1026
1027	dst = (struct pipe_resource*)item->real_buffer;
1028
1029	if (usage & PIPE_TRANSFER_READ)
1030		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1031
1032	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1033			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1034			"width = %u, height = %u, depth = %u)\n", level, usage,
1035			box->x, box->y, box->z, box->width, box->height,
1036			box->depth);
1037	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1038		"%u (box.x)\n", item->id, box->x);
1039
1040
1041	assert(resource->target == PIPE_BUFFER);
1042	assert(resource->bind & PIPE_BIND_GLOBAL);
1043	assert(box->x >= 0);
1044	assert(box->y == 0);
1045	assert(box->z == 0);
1046
1047	/* TODO: do it better; mapping is not possible if the pool is too big */
1048	return pipe_buffer_map_range(ctx_, dst,
1049			offset, box->width, usage, ptransfer);
1050}
1051
1052void r600_compute_global_transfer_unmap(
1053	struct pipe_context *ctx_,
1054	struct pipe_transfer* transfer)
1055{
1056	/* struct r600_resource_global are not real resources, they just map
1057	 * to an offset within the compute memory pool.  The function
1058	 * r600_compute_global_transfer_map() maps the memory pool
1059	 * resource rather than the struct r600_resource_global passed to
1060	 * it as an argument and then initializes ptransfer->resource with
1061	 * the memory pool resource (via pipe_buffer_map_range).
1062	 * When transfer_unmap is called, it uses the memory pool's
1063	 * vtable, which calls r600_buffer_transfer_unmap() rather than
1064	 * this function.
1065	 */
1066	assert (!"This function should not be called");
1067}
1068
1069void r600_compute_global_transfer_flush_region(
1070	struct pipe_context *ctx_,
1071	struct pipe_transfer *transfer,
1072	const struct pipe_box *box)
1073{
1074	assert(0 && "TODO");
1075}
1076
1077void r600_compute_global_transfer_inline_write(
1078	struct pipe_context *pipe,
1079	struct pipe_resource *resource,
1080	unsigned level,
1081	unsigned usage,
1082	const struct pipe_box *box,
1083	const void *data,
1084	unsigned stride,
1085	unsigned layer_stride)
1086{
1087	assert(0 && "TODO");
1088}
1089