evergreen_compute.c revision 0bd858d7ff4a16228164e3157aca846edeb6c228
1/*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 *      Adam Rak <adam.rak@streamnovation.com>
25 */
26
27#include <stdio.h>
28#include <errno.h>
29#include "pipe/p_defines.h"
30#include "pipe/p_state.h"
31#include "pipe/p_context.h"
32#include "util/u_blitter.h"
33#include "util/u_double_list.h"
34#include "util/u_transfer.h"
35#include "util/u_surface.h"
36#include "util/u_pack_color.h"
37#include "util/u_memory.h"
38#include "util/u_inlines.h"
39#include "util/u_framebuffer.h"
40#include "pipebuffer/pb_buffer.h"
41#include "evergreend.h"
42#include "r600_resource.h"
43#include "r600_shader.h"
44#include "r600_pipe.h"
45#include "r600_formats.h"
46#include "evergreen_compute.h"
47#include "evergreen_compute_internal.h"
48#include "compute_memory_pool.h"
49#include "sb/sb_public.h"
50#ifdef HAVE_OPENCL
51#include "radeon_llvm_util.h"
52#endif
53
54/**
55RAT0 is for global binding write
56VTX1 is for global binding read
57
58for writing images RAT1...
59for reading images TEX2...
60  TEX2-RAT1 is paired
61
62TEX2... consumes the same fetch resources that VTX2... would consume
63
64CONST0 and VTX0 are for parameters
65  CONST0 binds the smaller input parameter buffer and is used for constant indexing;
66  it is also constant cached
67  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
68  the constant cache can handle
69
70RATs are limited to 12, so we can only bind at most 11 textures for writing
71because we reserve RAT0 for global bindings. With byte addressing enabled,
72we should reserve another one too => 10 image bindings for writing max.
73
74from Nvidia OpenCL:
75  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
76  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
77
78so 10 for writing is enough. 176 is the max for reading according to the docs
79
80writable images should be listed first (id < 10), so their id corresponds to RAT(id+1)
81writable images will consume TEX slots, and VTX slots too because of linear indexing
82
83*/
84
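/* Allocate a driver-private VRAM buffer of 'size' bytes.  Used for internal
 * compute data such as the uploaded kernel bytecode. */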
85struct r600_resource* r600_compute_buffer_alloc_vram(
86       struct r600_screen *screen,
87       unsigned size)
88{
89	struct pipe_resource * buffer = NULL;
90	assert(size);
91
92	buffer = pipe_buffer_create(
93		(struct pipe_screen*) screen,
94		PIPE_BIND_CUSTOM,
95		PIPE_USAGE_IMMUTABLE,
96		size);
97
98	return (struct r600_resource *)buffer;
99}
100
101
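/* Bind 'bo' as RAT (Random Access Target) number 'id': wrap it in an R32_UINT
 * surface and install that surface as color buffer 'id' of the compute
 * framebuffer, updating nr_cbufs and the compute CB target mask. */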
102static void evergreen_set_rat(
103	struct r600_pipe_compute *pipe,
104	int id,
105	struct r600_resource* bo,
106	int start,
107	int size)
108{
109	struct pipe_surface rat_templ;
110	struct r600_surface *surf = NULL;
111	struct r600_context *rctx = NULL;
112
113	assert(id < 12);
114	assert((size & 3) == 0);
115	assert((start & 0xFF) == 0);
116
117	rctx = pipe->ctx;
118
119	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
120
121	/* Create the RAT surface */
122	memset(&rat_templ, 0, sizeof(rat_templ));
123	rat_templ.format = PIPE_FORMAT_R32_UINT;
124	rat_templ.u.tex.level = 0;
125	rat_templ.u.tex.first_layer = 0;
126	rat_templ.u.tex.last_layer = 0;
127
128	/* Add the RAT to the list of color buffers */
129	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
130		(struct pipe_context *)pipe->ctx,
131		(struct pipe_resource *)bo, &rat_templ);
132
133	/* Update the number of color buffers */
134	pipe->ctx->framebuffer.state.nr_cbufs =
135		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
136
137	/* Update the cb_target_mask
138	 * XXX: I think this is a potential spot for bugs once we start doing
139	 * GL interop.  cb_target_mask may be modified in the 3D sections
140	 * of this driver. */
141	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
142
143	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
144	evergreen_init_color_surface_rat(rctx, surf);
145}
146
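/* Point compute vertex buffer slot 'vb_index' at 'buffer' + 'offset' and mark
 * the CS vertex buffer atom dirty so the new binding gets emitted. */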
147static void evergreen_cs_set_vertex_buffer(
148	struct r600_context * rctx,
149	unsigned vb_index,
150	unsigned offset,
151	struct pipe_resource * buffer)
152{
153	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
154	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
155	vb->stride = 1;
156	vb->buffer_offset = offset;
157	vb->buffer = buffer;
158	vb->user_buffer = NULL;
159
160	/* The vertex instructions in the compute shaders use the texture cache,
161	 * so we need to invalidate it. */
162	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
163	state->enabled_mask |= 1 << vb_index;
164	state->dirty_mask |= 1 << vb_index;
165	state->atom.dirty = true;
166}
167
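/* Bind a 'size'-byte range of 'buffer' at 'offset' as compute constant
 * buffer 'cb_index'. */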
168static void evergreen_cs_set_constant_buffer(
169	struct r600_context * rctx,
170	unsigned cb_index,
171	unsigned offset,
172	unsigned size,
173	struct pipe_resource * buffer)
174{
175	struct pipe_constant_buffer cb;
176	cb.buffer_size = size;
177	cb.buffer_offset = offset;
178	cb.buffer = buffer;
179	cb.user_buffer = NULL;
180
181	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
182}
183
184static const struct u_resource_vtbl r600_global_buffer_vtbl =
185{
186	u_default_resource_get_handle, /* get_handle */
187	r600_compute_global_buffer_destroy, /* resource_destroy */
188	r600_compute_global_transfer_map, /* transfer_map */
189	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
190	r600_compute_global_transfer_unmap, /* transfer_unmap */
191	r600_compute_global_transfer_inline_write /* transfer_inline_write */
192};
193
194
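/* With OpenCL support, cso->prog carries an LLVM bitcode blob; it is split
 * into one module per kernel here, and each kernel is only compiled to r600
 * bytecode on first use in evergreen_launch_grid(). */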
195void *evergreen_create_compute_state(
196	struct pipe_context *ctx_,
197	const struct pipe_compute_state *cso)
198{
199	struct r600_context *ctx = (struct r600_context *)ctx_;
200	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
201
202#ifdef HAVE_OPENCL
203	const struct pipe_llvm_program_header * header;
204	const unsigned char * code;
205	unsigned i;
206
207	shader->llvm_ctx = LLVMContextCreate();
208
209	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
210
211	header = cso->prog;
212	code = cso->prog + sizeof(struct pipe_llvm_program_header);
213#endif
214
215	shader->ctx = (struct r600_context*)ctx;
216	shader->local_size = cso->req_local_mem;
217	shader->private_size = cso->req_private_mem;
218	shader->input_size = cso->req_input_mem;
219
220#ifdef HAVE_OPENCL
221	shader->num_kernels = radeon_llvm_get_num_kernels(shader->llvm_ctx, code,
222							header->num_bytes);
223	shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
224
225	for (i = 0; i < shader->num_kernels; i++) {
226		struct r600_kernel *kernel = &shader->kernels[i];
227		kernel->llvm_module = radeon_llvm_get_kernel_module(shader->llvm_ctx, i,
228							code, header->num_bytes);
229	}
230#endif
231	return shader;
232}
233
234void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
235{
236	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
237
238	if (!shader)
239		return;
240
241	FREE(shader->kernels);
242
243#ifdef HAVE_OPENCL
244	if (shader->llvm_ctx){
245		LLVMContextDispose(shader->llvm_ctx);
246	}
247#endif
248
249	FREE(shader);
250}
251
252static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
253{
254	struct r600_context *ctx = (struct r600_context *)ctx_;
255
256	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
257
258	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
259}
260
261/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
262 * explicit kernel parameters, there are implicit parameters that need to be
263 * stored in the vertex buffer as well.  Here is how these parameters are
264 * organized in the buffer:
265 *
266 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
267 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
268 * DWORDS 6-8: Number of work items within each work group in each dimension
269 *             (x,y,z)
270 * DWORDS 9+ : Kernel parameters
271 */
272void evergreen_compute_upload_input(
273	struct pipe_context *ctx_,
274	const uint *block_layout,
275	const uint *grid_layout,
276	const void *input)
277{
278	struct r600_context *ctx = (struct r600_context *)ctx_;
279	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
280	int i;
281	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
282	 * parameters.
283	 */
284	unsigned input_size = shader->input_size + 36;
285	uint32_t * num_work_groups_start;
286	uint32_t * global_size_start;
287	uint32_t * local_size_start;
288	uint32_t * kernel_parameters_start;
289	struct pipe_box box;
290	struct pipe_transfer *transfer = NULL;
291
292	if (shader->input_size == 0) {
293		return;
294	}
295
296	if (!shader->kernel_param) {
297		/* Add space for the grid dimensions */
298		shader->kernel_param = (struct r600_resource *)
299			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
300					PIPE_USAGE_IMMUTABLE, input_size);
301	}
302
303	u_box_1d(0, input_size, &box);
304	num_work_groups_start = ctx_->transfer_map(ctx_,
305			(struct pipe_resource*)shader->kernel_param,
306			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
307			&box, &transfer);
308	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
309	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
310	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));
311
312	/* Copy the number of work groups in each dimension */
313	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
314
315	/* Copy the global size */
316	for (i = 0; i < 3; i++) {
317		global_size_start[i] = grid_layout[i] * block_layout[i];
318	}
319
320	/* Copy the local dimensions */
321	memcpy(local_size_start, block_layout, 3 * sizeof(uint));
322
323	/* Copy the kernel inputs */
324	memcpy(kernel_parameters_start, input, shader->input_size);
325
326	for (i = 0; i < (input_size / 4); i++) {
327		COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
328			((unsigned*)num_work_groups_start)[i]);
329	}
330
331	ctx_->transfer_unmap(ctx_, transfer);
332
333	/* ID=0 is reserved for the parameters */
334	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
335			(struct pipe_resource*)shader->kernel_param);
336}
337
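/* Program the thread group and grid dimension registers, allocate LDS for the
 * active kernel and emit the DISPATCH_DIRECT packet. */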
338static void evergreen_emit_direct_dispatch(
339		struct r600_context *rctx,
340		const uint *block_layout, const uint *grid_layout)
341{
342	int i;
343	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
344	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
345	unsigned num_waves;
346	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
347	unsigned wave_divisor = (16 * num_pipes);
348	int group_size = 1;
349	int grid_size = 1;
350	unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;
351
352	/* Calculate group_size/grid_size */
353	for (i = 0; i < 3; i++) {
354		group_size *= block_layout[i];
355	}
356
357	for (i = 0; i < 3; i++)	{
358		grid_size *= grid_layout[i];
359	}
360
361	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
362	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
363			wave_divisor - 1) / wave_divisor;
364
365	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
366				"%u wavefronts per thread block, "
367				"allocating %u dwords lds.\n",
368				num_pipes, num_waves, lds_size);
369
370	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
371
372	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
373	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
374	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
375	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
376
377	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
378								group_size);
379
380	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
381	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
382	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
383	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
384
385	if (rctx->b.chip_class < CAYMAN) {
386		assert(lds_size <= 8192);
387	} else {
388		/* Cayman appears to have a slightly smaller limit, see the
389		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
390		assert(lds_size <= 8160);
391	}
392
393	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
394					lds_size | (num_waves << 14));
395
396	/* Dispatch packet */
397	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
398	radeon_emit(cs, grid_layout[0]);
399	radeon_emit(cs, grid_layout[1]);
400	radeon_emit(cs, grid_layout[2]);
401	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
402	radeon_emit(cs, 1);
403}
404
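/* Build the command stream for one dispatch: emit the compute start state,
 * the RAT color buffers, the vertex buffer, constant buffer and shader
 * atoms, then the dispatch packet and the cache flushes it requires. */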
405static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
406		const uint *grid_layout)
407{
408	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
409	int i;
410
411	/* make sure that the gfx ring is the only one active */
412	if (ctx->b.rings.dma.cs) {
413		ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
414	}
415
416	/* Initialize all the compute-related registers.
417	 *
418	 * See evergreen_init_atom_start_compute_cs() in this file for the list
419	 * of registers initialized by the start_compute_cs_cmd atom.
420	 */
421	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
422
423	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
424	r600_flush_emit(ctx);
425
426	/* Emit colorbuffers. */
427	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
428	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
429		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
430		unsigned reloc = r600_context_bo_reloc(&ctx->b, &ctx->b.rings.gfx,
431						       (struct r600_resource*)cb->base.texture,
432						       RADEON_USAGE_READWRITE);
433
434		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
435		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
436		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
437		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
438		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
439		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
440		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
441		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
442
443		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
444		radeon_emit(cs, reloc);
445
446		if (!ctx->keep_tiling_flags) {
447			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
448			radeon_emit(cs, reloc);
449		}
450
451		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
452		radeon_emit(cs, reloc);
453	}
454	if (ctx->keep_tiling_flags) {
455		for (; i < 8 ; i++) {
456			r600_write_compute_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
457						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
458		}
459		for (; i < 12; i++) {
460			r600_write_compute_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
461						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
462		}
463	}
464
465	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
466	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
467					ctx->compute_cb_target_mask);
468
469
470	/* Emit vertex buffer state */
471	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
472	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
473
474	/* Emit constant buffer state */
475	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
476
477	/* Emit compute shader state */
478	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
479
480	/* Emit dispatch state and dispatch packet */
481	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
482
483	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
484	 */
485	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
486		      R600_CONTEXT_INV_VERTEX_CACHE |
487	              R600_CONTEXT_INV_TEX_CACHE;
488	r600_flush_emit(ctx);
489	ctx->b.flags = 0;
490
491	if (ctx->b.chip_class >= CAYMAN) {
492		ctx->skip_surface_sync_on_next_cs_flush = true;
493	}
494
495#if 0
496	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
497	for (i = 0; i < cs->cdw; i++) {
498		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
499	}
500#endif
501
502}
503
504
505/**
506 * Emit function for r600_cs_shader_state atom
507 */
508void evergreen_emit_cs_shader(
509		struct r600_context *rctx,
510		struct r600_atom *atom)
511{
512	struct r600_cs_shader_state *state =
513					(struct r600_cs_shader_state*)atom;
514	struct r600_pipe_compute *shader = state->shader;
515	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
516	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
517	uint64_t va;
518
519	va = r600_resource_va(&rctx->screen->b.b, &kernel->code_bo->b.b);
520
521	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
522	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
523	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
524			S_0288D4_NUM_GPRS(kernel->bc.ngpr)
525			| S_0288D4_STACK_SIZE(kernel->bc.nstack));
526	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
527
528	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
529	radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
530							kernel->code_bo, RADEON_USAGE_READ));
531}
532
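/* Top-level grid launch.  On the first launch of a kernel its LLVM module is
 * compiled to r600 bytecode and uploaded to a code buffer; afterwards the
 * kernel inputs are uploaded and the command stream is emitted. */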
533static void evergreen_launch_grid(
534		struct pipe_context *ctx_,
535		const uint *block_layout, const uint *grid_layout,
536		uint32_t pc, const void *input)
537{
538	struct r600_context *ctx = (struct r600_context *)ctx_;
539
540	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
541	struct r600_kernel *kernel = &shader->kernels[pc];
542
543	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
544
545#ifdef HAVE_OPENCL
546
547	if (!kernel->code_bo) {
548		void *p;
549		struct r600_bytecode *bc = &kernel->bc;
550		LLVMModuleRef mod = kernel->llvm_module;
551		boolean use_kill = false;
552		bool dump = (ctx->screen->b.debug_flags & DBG_CS) != 0;
553		unsigned use_sb = ctx->screen->b.debug_flags & DBG_SB_CS;
554		unsigned sb_disasm = use_sb ||
555			(ctx->screen->b.debug_flags & DBG_SB_DISASM);
556
557		r600_bytecode_init(bc, ctx->b.chip_class, ctx->b.family,
558			   ctx->screen->has_compressed_msaa_texturing);
559		bc->type = TGSI_PROCESSOR_COMPUTE;
560		bc->isa = ctx->isa;
561		r600_llvm_compile(mod, ctx->b.family, bc, &use_kill, dump);
562
563		if (dump && !sb_disasm) {
564			r600_bytecode_disasm(bc);
565		} else if ((dump && sb_disasm) || use_sb) {
566			if (r600_sb_bytecode_process(ctx, bc, NULL, dump, use_sb))
567				R600_ERR("r600_sb_bytecode_process failed!\n");
568		}
569
570		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
571							kernel->bc.ndw * 4);
572		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
573		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
574		ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
575	}
576#endif
577	shader->active_kernel = kernel;
578	ctx->cs_shader_state.kernel_index = pc;
579	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
580	compute_emit_cs(ctx, block_layout, grid_layout);
581}
582
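/* Bind compute resources (global buffers/images): writable surfaces are bound
 * as RAT1..RATn, and every surface is also bound as a vertex buffer starting
 * at slot 2, since slots 0 and 1 are reserved (see comment below). */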
583static void evergreen_set_compute_resources(struct pipe_context * ctx_,
584		unsigned start, unsigned count,
585		struct pipe_surface ** surfaces)
586{
587	struct r600_context *ctx = (struct r600_context *)ctx_;
588	struct r600_surface **resources = (struct r600_surface **)surfaces;
589
590	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
591			start, count);
592
593	for (int i = 0; i < count; i++)	{
594		/* The first two vertex buffers are reserved for parameters and
595		 * global buffers. */
596		unsigned vtx_id = 2 + i;
597		if (resources[i]) {
598			struct r600_resource_global *buffer =
599				(struct r600_resource_global*)
600				resources[i]->base.texture;
601			if (resources[i]->base.writable) {
602				assert(i+1 < 12);
603
604				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
605				(struct r600_resource *)resources[i]->base.texture,
606				buffer->chunk->start_in_dw*4,
607				resources[i]->base.texture->width0);
608			}
609
610			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
611					buffer->chunk->start_in_dw * 4,
612					resources[i]->base.texture);
613		}
614	}
615}
616
617void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
618		unsigned start_slot, unsigned count,
619		struct pipe_sampler_view **views)
620{
621	struct r600_pipe_sampler_view **resource =
622		(struct r600_pipe_sampler_view **)views;
623
624	for (int i = 0; i < count; i++)	{
625		if (resource[i]) {
626			assert(i+1 < 12);
627			/* XXX: Implement */
628			assert(!"Compute samplers not implemented.");
629			/* FETCH0 = VTX0 (param buffer),
630			 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
631		}
632	}
633}
634
635
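/* Bind OpenCL global buffers.  They all live in the screen's compute memory
 * pool: finalize any pending allocations, report each buffer's byte offset
 * within the pool through 'handles', and bind the whole pool as RAT0 (for
 * writes) and vertex buffer 1 (for reads). */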
636static void evergreen_set_global_binding(
637	struct pipe_context *ctx_, unsigned first, unsigned n,
638	struct pipe_resource **resources,
639	uint32_t **handles)
640{
641	struct r600_context *ctx = (struct r600_context *)ctx_;
642	struct compute_memory_pool *pool = ctx->screen->global_pool;
643	struct r600_resource_global **buffers =
644		(struct r600_resource_global **)resources;
645
646	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
647			first, n);
648
649	if (!resources) {
650		/* XXX: Unset */
651		return;
652	}
653
654	compute_memory_finalize_pending(pool, ctx_);
655
656	for (int i = 0; i < n; i++)
657	{
658		assert(resources[i]->target == PIPE_BUFFER);
659		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
660
661		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
662	}
663
664	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
665	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
666				(struct pipe_resource*)pool->bo);
667}
668
669/**
670 * This function initializes all the compute specific registers that need to
671 * be initialized for each compute command stream.  Registers that are common
672 * to both compute and 3D will be initialized at the beginning of each compute
673 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
674 * packet requires that the shader type bit be set, we must initialize all
675 * context registers needed for compute in this function.  The registers
676 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
677 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
678 * on the GPU family.
679 */
680void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
681{
682	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
683	int num_threads;
684	int num_stack_entries;
685
686	/* since all required registers are initialised in the
687	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
688	 */
689	r600_init_command_buffer(cb, 256);
690	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
691
692	/* This must be first. */
693	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
694	r600_store_value(cb, 0x80000000);
695	r600_store_value(cb, 0x80000000);
696
697	/* We're setting config registers here. */
698	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
699	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
700
701	switch (ctx->b.family) {
702	case CHIP_CEDAR:
703	default:
704		num_threads = 128;
705		num_stack_entries = 256;
706		break;
707	case CHIP_REDWOOD:
708		num_threads = 128;
709		num_stack_entries = 256;
710		break;
711	case CHIP_JUNIPER:
712		num_threads = 128;
713		num_stack_entries = 512;
714		break;
715	case CHIP_CYPRESS:
716	case CHIP_HEMLOCK:
717		num_threads = 128;
718		num_stack_entries = 512;
719		break;
720	case CHIP_PALM:
721		num_threads = 128;
722		num_stack_entries = 256;
723		break;
724	case CHIP_SUMO:
725		num_threads = 128;
726		num_stack_entries = 256;
727		break;
728	case CHIP_SUMO2:
729		num_threads = 128;
730		num_stack_entries = 512;
731		break;
732	case CHIP_BARTS:
733		num_threads = 128;
734		num_stack_entries = 512;
735		break;
736	case CHIP_TURKS:
737		num_threads = 128;
738		num_stack_entries = 256;
739		break;
740	case CHIP_CAICOS:
741		num_threads = 128;
742		num_stack_entries = 256;
743		break;
744	}
745
746	/* Config Registers */
747	if (ctx->b.chip_class < CAYMAN)
748		evergreen_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
749					   ctx->screen->b.info.drm_minor);
750	else
751		cayman_init_common_regs(cb, ctx->b.chip_class, ctx->b.family,
752					ctx->screen->b.info.drm_minor);
753
754	/* The primitive type always needs to be POINTLIST for compute. */
755	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
756						V_008958_DI_PT_POINTLIST);
757
758	if (ctx->b.chip_class < CAYMAN) {
759
760		/* These registers control which simds can be used by each stage.
761		 * The default for these registers is 0xffffffff, which means
762		 * all simds are available for each stage.  It's possible we may
763		 * want to play around with these in the future, but for now
764		 * the default value is fine.
765		 *
766		 * R_008E20_SQ_STATIC_THREAD_MGMT1
767		 * R_008E24_SQ_STATIC_THREAD_MGMT2
768		 * R_008E28_SQ_STATIC_THREAD_MGMT3
769		 */
770
771		/* XXX: We may need to adjust the thread and stack resource
772		 * values for 3D/compute interop */
773
774		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
775
776		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
777		 * Set the number of threads used by the PS/VS/GS/ES stage to
778		 * 0.
779		 */
780		r600_store_value(cb, 0);
781
782		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
783		 * Set the number of threads used by the CS (aka LS) stage to
784		 * the maximum number of threads and set the number of threads
785		 * for the HS stage to 0. */
786		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
787
788		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
789		 * Set the Control Flow stack entries to 0 for PS/VS stages */
790		r600_store_value(cb, 0);
791
792		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
793		 * Set the Control Flow stack entries to 0 for GS/ES stages */
794		r600_store_value(cb, 0);
795
796		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
797		 * Set the Control Flow stack entries to 0 for the HS stage, and
798		 * set it to the maximum value for the CS (aka LS) stage. */
799		r600_store_value(cb,
800			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
801	}
802	/* Give the compute shader all the available LDS space.
803	 * NOTE: This only sets the maximum number of dwords that a compute
804	 * shader can allocate.  When a shader is executed, we still need to
805	 * allocate the appropriate amount of LDS dwords using the
806	 * CM_R_0288E8_SQ_LDS_ALLOC register.
807	 */
808	if (ctx->b.chip_class < CAYMAN) {
809		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
810			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
811	} else {
812		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
813			S_0286FC_NUM_PS_LDS(0) |
814			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
815	}
816
817	/* Context Registers */
818
819	if (ctx->b.chip_class < CAYMAN) {
820		/* workaround for hw issues with dyn gpr - must set all limits
821		 * to 240 instead of 0, 0x1e == 240 / 8
822		 */
823		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
824				S_028838_PS_GPRS(0x1e) |
825				S_028838_VS_GPRS(0x1e) |
826				S_028838_GS_GPRS(0x1e) |
827				S_028838_ES_GPRS(0x1e) |
828				S_028838_HS_GPRS(0x1e) |
829				S_028838_LS_GPRS(0x1e));
830	}
831
832	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
833	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
834		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
835
836	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
837
838	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
839						S_0286E8_TID_IN_GROUP_ENA
840						| S_0286E8_TGID_ENA
841						| S_0286E8_DISABLE_INDEX_PACK);
842
843
844	/* The LOOP_CONST registers are an optimization for loops that allows
845	 * you to store the initial counter, increment value, and maximum
846	 * counter value in a register so that hardware can calculate the
847	 * correct number of iterations for the loop, so that you don't need
848	 * to have the loop counter in your shader code.  We don't currently use
849	 * this optimization, so we must keep track of the counter in the
850	 * shader and use a break instruction to exit loops.  However, the
851	 * hardware will still use this register to determine when to exit a
852	 * loop, so we need to initialize the counter to 0, set the increment
853	 * value to 1 and the maximum counter value to 4095 (0xfff), which
854	 * is the maximum value allowed.  This gives us a maximum of 4096
855	 * iterations for our loops, but hopefully our break instruction will
856	 * execute some time before the 4096th iteration.
857	 */
858	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
859}
860
861void evergreen_init_compute_state_functions(struct r600_context *ctx)
862{
863	ctx->b.b.create_compute_state = evergreen_create_compute_state;
864	ctx->b.b.delete_compute_state = evergreen_delete_compute_state;
865	ctx->b.b.bind_compute_state = evergreen_bind_compute_state;
866//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
867	ctx->b.b.set_compute_resources = evergreen_set_compute_resources;
868	ctx->b.b.set_global_binding = evergreen_set_global_binding;
869	ctx->b.b.launch_grid = evergreen_launch_grid;
870
871	/* We always use at least one vertex buffer for parameters (id = 1) */
872	ctx->cs_vertex_buffer_state.enabled_mask =
873	ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
874}
875
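/* A PIPE_BIND_GLOBAL buffer is not a separate allocation; it is a chunk
 * carved out of the screen's compute memory pool. */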
876struct pipe_resource *r600_compute_global_buffer_create(
877	struct pipe_screen *screen,
878	const struct pipe_resource *templ)
879{
880	struct r600_resource_global* result = NULL;
881	struct r600_screen* rscreen = NULL;
882	int size_in_dw = 0;
883
884	assert(templ->target == PIPE_BUFFER);
885	assert(templ->bind & PIPE_BIND_GLOBAL);
886	assert(templ->array_size == 1 || templ->array_size == 0);
887	assert(templ->depth0 == 1 || templ->depth0 == 0);
888	assert(templ->height0 == 1 || templ->height0 == 0);
889
890	result = (struct r600_resource_global*)
891	CALLOC(sizeof(struct r600_resource_global), 1);
892	rscreen = (struct r600_screen*)screen;
893
894	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
895	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
896			templ->array_size);
897
898	result->base.b.vtbl = &r600_global_buffer_vtbl;
899	result->base.b.b.screen = screen;
900	result->base.b.b = *templ;
901	pipe_reference_init(&result->base.b.b.reference, 1);
902
903	size_in_dw = (templ->width0+3) / 4;
904
905	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
906
907	if (result->chunk == NULL)
908	{
909		free(result);
910		return NULL;
911	}
912
913	return &result->base.b.b;
914}
915
916void r600_compute_global_buffer_destroy(
917	struct pipe_screen *screen,
918	struct pipe_resource *res)
919{
920	struct r600_resource_global* buffer = NULL;
921	struct r600_screen* rscreen = NULL;
922
923	assert(res->target == PIPE_BUFFER);
924	assert(res->bind & PIPE_BIND_GLOBAL);
925
926	buffer = (struct r600_resource_global*)res;
927	rscreen = (struct r600_screen*)screen;
928
929	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
930
931	buffer->chunk = NULL;
932	free(res);
933}
934
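/* Map a global buffer by mapping the underlying memory pool buffer at the
 * chunk's offset; see the comment in r600_compute_global_transfer_unmap()
 * for how the matching unmap is handled. */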
935void *r600_compute_global_transfer_map(
936	struct pipe_context *ctx_,
937	struct pipe_resource *resource,
938	unsigned level,
939	unsigned usage,
940	const struct pipe_box *box,
941	struct pipe_transfer **ptransfer)
942{
943	struct r600_context *rctx = (struct r600_context*)ctx_;
944	struct compute_memory_pool *pool = rctx->screen->global_pool;
945	struct r600_resource_global* buffer =
946		(struct r600_resource_global*)resource;
947
948	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
949			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
950			"width = %u, height = %u, depth = %u)\n", level, usage,
951			box->x, box->y, box->z, box->width, box->height,
952			box->depth);
953	COMPUTE_DBG(rctx->screen, "Buffer id = %u offset = "
954		"%u (box.x)\n", buffer->chunk->id, box->x);
955
956
957	compute_memory_finalize_pending(pool, ctx_);
958
959	assert(resource->target == PIPE_BUFFER);
960	assert(resource->bind & PIPE_BIND_GLOBAL);
961	assert(box->x >= 0);
962	assert(box->y == 0);
963	assert(box->z == 0);
964
965	/* TODO: do it better, mapping is not possible if the pool is too big */
966	return pipe_buffer_map_range(ctx_, (struct pipe_resource*)buffer->chunk->pool->bo,
967			box->x + (buffer->chunk->start_in_dw * 4),
968			box->width, usage, ptransfer);
969}
970
971void r600_compute_global_transfer_unmap(
972	struct pipe_context *ctx_,
973	struct pipe_transfer* transfer)
974{
975	/* struct r600_resource_global are not real resources, they just map
976	 * to an offset within the compute memory pool.  The function
977	 * r600_compute_global_transfer_map() maps the memory pool
978	 * resource rather than the struct r600_resource_global passed to
979	 * it as an argument and then initializes ptransfer->resource with
980	 * the memory pool resource (via pipe_buffer_map_range).
981	 * When transfer_unmap is called it uses the memory pool's
982	 * vtable, which calls r600_buffer_transfer_unmap() rather than
983	 * this function.
984	 */
985	assert (!"This function should not be called");
986}
987
988void r600_compute_global_transfer_flush_region(
989	struct pipe_context *ctx_,
990	struct pipe_transfer *transfer,
991	const struct pipe_box *box)
992{
993	assert(0 && "TODO");
994}
995
996void r600_compute_global_transfer_inline_write(
997	struct pipe_context *pipe,
998	struct pipe_resource *resource,
999	unsigned level,
1000	unsigned usage,
1001	const struct pipe_box *box,
1002	const void *data,
1003	unsigned stride,
1004	unsigned layer_stride)
1005{
1006	assert(0 && "TODO");
1007}
1008