evergreen_compute.c revision 9827fc3f038f551dccc64e3addfaf87e724408f9
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#ifdef HAVE_OPENCL
#include "radeon/radeon_llvm_util.h"
#endif
#include "radeon/radeon_elf_util.h"
#include <inttypes.h>

/**
RAT0 is for global binding writes
VTX1 is for global binding reads

for writing images RAT1...
for reading images TEX2...
  TEX2 and RAT1 are paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images also consume TEX slots, and VTX slots too because of linear indexing

*/
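
/* A worked example of the scheme above (illustrative only; the authoritative
 * mapping is whatever evergreen_set_rat(), evergreen_cs_set_vertex_buffer()
 * and friends below actually program):
 *
 *   RAT0        global buffer writes          (evergreen_set_global_binding)
 *   VTX1        global buffer reads           (evergreen_set_global_binding)
 *   VTX2        constants in the code BO      (evergreen_set_global_binding)
 *   CONST0/VTX0 kernel input parameters       (evergreen_compute_upload_input)
 *   RAT(i+1)    writable surface i            (evergreen_set_compute_resources)
 *   VTX(3+i)    readable view of surface i    (evergreen_set_compute_resources)
 */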

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    PIPE_BIND_CUSTOM,
				    PIPE_USAGE_IMMUTABLE,
				    size);

	return (struct r600_resource *)buffer;
}

static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i\n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * r600d.h together with evergreend.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850

#ifdef HAVE_OPENCL

static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		radeon_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct radeon_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}
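
/* Note: the cso->prog blob consumed above is a struct pipe_llvm_program_header
 * followed immediately by header->num_bytes bytes of ELF image, so e.g. a
 * 4 KiB kernel binary arrives as sizeof(header) + 4096 bytes (the 4 KiB figure
 * is only an example). */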

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	radeon_shader_binary_clean(&shader->binary);
	r600_destroy_shader(&shader->bc);

	/* TODO destroy shader->code_bo, shader->const_bo
	 * we'll need something like r600_buffer_free */
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;

	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
					PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->transfer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->transfer_unmap(ctx, transfer);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}
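
/* Illustrative sketch of the input buffer layout documented above, expressed
 * as a struct.  This type is not used by the driver; the name and the
 * assumption that the kernel's own parameters are packed right after DWORD 8
 * are ours. */
struct evergreen_compute_input_prologue {
	uint32_t num_work_groups[3]; /* DWORDS 0-2: info->grid[] */
	uint32_t global_size[3];     /* DWORDS 3-5: grid[i] * block[i] */
	uint32_t local_size[3];      /* DWORDS 6-8: info->block[] */
	/* DWORDS 9+: the kernel parameters, copied verbatim from info->input */
};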

static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
		shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
			wave_divisor - 1) / wave_divisor;
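	/* Worked example with assumed numbers: a 16x16x1 thread block on a
	 * chip reporting r600_max_quad_pipes = 8 gives wave_divisor = 128,
	 * so num_waves = (256 + 127) / 128 = 2 wavefronts per group. */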

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));
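	/* The value packs the LDS size (in dwords, asserted above to fit in
	 * the low bits) together with the wave count shifted up by 14; e.g.
	 * lds_size = 512 and num_waves = 2 would be written as
	 * 512 | (2 << 14). */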

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, info->grid[0]);
	radeon_emit(cs, info->grid[1]);
	radeon_emit(cs, info->grid[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}

static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	unsigned i;

	/* make sure that the gfx ring is the only one active */
	if (radeon_emitted(rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN)
		r600_emit_atom(rctx, &rctx->config_state.atom);

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8 ; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					rctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			R600_CONTEXT_INV_VERTEX_CACHE |
			R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_USER_SHADER));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	rctx->cs_shader_state.pc = info->pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc,
				       info->pc, &use_kill);
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first three vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 3 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i + 1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i + 1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}
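
/* Illustrative example of the mapping applied above (no additional driver
 * code): surface 0, if writable, is bound as RAT1 and is also readable
 * through vertex buffer 3; surface 1 would use RAT2/VTX4, and so on.
 * Vertex buffers 0-2 stay reserved for the kernel parameters, the global
 * memory pool and the shader code BO (see evergreen_set_global_binding()). */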

static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}
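
	/* At this point each handle holds an absolute byte offset into the
	 * pool BO.  With made-up numbers: a chunk starting at dword 256 and an
	 * application-relative offset of 16 bytes yields 16 + 256 * 4 = 1040. */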

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading, LLVM puts them in the text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (rctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
					   rctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
					rctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA
						| S_0286E8_TGID_ENA
						| S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
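	/* 0x1000FFF below encodes exactly that, assuming the usual r600
	 * SQ_LOOP_CONST packing (count in bits 0-11, init in bits 12-23,
	 * increment in bits 24-31): count = 0xFFF, init = 0, increment = 1. */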
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

static void *r600_compute_global_transfer_map(struct pipe_context *ctx,
					      struct pipe_resource *resource,
					      unsigned level,
					      unsigned usage,
					      const struct pipe_box *box,
					      struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage, ptransfer);
}

static void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					       struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert(!"This function should not be called");
}

static void r600_compute_global_transfer_flush_region(struct pipe_context *ctx,
						       struct pipe_transfer *transfer,
						       const struct pipe_box *box)
{
	assert(0 && "TODO");
}

static void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
						       struct pipe_resource *resource,
						       unsigned level,
						       unsigned usage,
						       const struct pipe_box *box,
						       const void *data,
						       unsigned stride,
						       unsigned layer_stride)
{
	assert(0 && "TODO");
}

static void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					       struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
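
/* Illustrative arithmetic for the sizing above: a request of width0 = 18
 * bytes rounds up to (18 + 3) / 4 = 5 dwords of the global memory pool. */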
1057