evergreen_compute.c revision aeb2be3a2f1839b91532b178b997b20ddb69eb13
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#ifdef HAVE_OPENCL
#include "radeon/radeon_llvm_util.h"
#endif
#include "radeon/radeon_elf_util.h"
#include <inttypes.h>
/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer, and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too. => 10 image bindings for writing max.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too because of linear indexing

*/
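
/* A sketch of the resulting slot assignment (illustrative only; it follows
 * from the rules above and is not enforced by any table in this file):
 *
 *   RAT0 -> global buffer writes       VTX0 -> kernel parameters
 *   RAT1 -> writable image 0           VTX1 -> global buffer reads
 *   RAT2 -> writable image 1           TEX2 -> image 0 reads (pairs with RAT1)
 *   ...                                TEX3 -> image 1 reads (pairs with RAT2)
 */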

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    PIPE_BIND_CUSTOM,
				    PIPE_USAGE_IMMUTABLE,
				    size);

	return (struct r600_resource *)buffer;
}

static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i\n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers */
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop.  cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, &cb);
}
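
/* Vtable for PIPE_BIND_GLOBAL buffers.  These are not backed by BOs of their
 * own; they live as chunks of the compute memory pool, and the callbacks
 * below (defined at the end of this file) redirect accesses accordingly. */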
static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};

/* We need to define these R600 registers here, because we can't include
 * r600d.h alongside evergreend.h (they define conflicting macros).
 */
#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850

#ifdef HAVE_OPENCL

static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		radeon_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct radeon_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

void *evergreen_create_compute_state(struct pipe_context *ctx_,
				     const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header *header;
	const char *code;
	void *p;
	boolean use_kill;

	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
	radeon_shader_binary_init(&shader->binary);
	radeon_elf_read(code, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(&rctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(shader->code_bo->buf);
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	radeon_shader_binary_clean(&shader->binary);
	r600_destroy_shader(&shader->bc);

	/* TODO: destroy shader->code_bo and shader->const_bo;
	 * we'll need something like r600_buffer_free. */
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;

	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
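
/* For example (an illustrative layout derived from the table above, not taken
 * from a spec): a kernel with two uint arguments launched as 8x8x1 blocks on
 * a 4x1x1 grid would upload:
 *
 *   DWORDS 0-2: 4, 1, 1    (number of work groups)
 *   DWORDS 3-5: 32, 8, 1   (global work items = grid * block)
 *   DWORDS 6-8: 8, 8, 1    (work items per group)
 *   DWORD  9  : arg0
 *   DWORD  10 : arg1
 */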
void evergreen_compute_upload_input(struct pipe_context *ctx_,
				    const uint *block_layout,
				    const uint *grid_layout,
				    const void *input)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size = shader->input_size + 36;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
					   PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx_->transfer_map(ctx_,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
			&box, &transfer);
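	/* Each implicit section is 3 dwords wide, so every pointer below
	 * advances by 3 uint32_t elements: sizeof(uint) / 4 == 1 here, since
	 * gallium's uint is 32 bits wide. */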
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the grid size (number of work groups per dimension) */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx_->transfer_unmap(ctx_, transfer);

	/* ID=0 is reserved for the parameters */
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

static void evergreen_emit_direct_dispatch(struct r600_context *rctx,
					   const uint *block_layout,
					   const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 +
		shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;
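	/* For example (illustrative numbers): a 16x16x1 block on a chip with
	 * num_pipes = 8 gives wave_divisor = 128, so
	 * num_waves = (256 + 127) / 128 = 2 wavefronts per thread block. */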

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}
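
	/* SQ_LDS_ALLOC packs the LDS dword count into the low bits and the
	 * wavefront count at bit 14, which is what the expression below
	 * encodes. */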
	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}

static void compute_emit_cs(struct r600_context *rctx,
			    const uint *block_layout,
			    const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	unsigned i;

	/* Make sure the gfx ring is the only active one: flush any pending
	 * work on the DMA ring first. */
	if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN)
		r600_emit_atom(rctx, &rctx->config_state.atom);

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8 ; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					rctx->compute_cb_target_mask);

	/* Emit vertex buffer state */
	rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit compute shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(rctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_USER_SHADER));
}

static void evergreen_launch_grid(struct pipe_context *ctx_,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	rctx->cs_shader_state.pc = info->pc;
	/* Get the config information for this kernel. */
	r600_shader_binary_read_config(&shader->binary, &shader->bc,
				       info->pc, &use_kill);
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx_, info->block, info->grid, info->input);
	compute_emit_cs(rctx, info->block, info->grid);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx_,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw*4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_global_binding(struct pipe_context *ctx_,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx_) == -1) {
		/* XXX: Unset */
		return;
	}
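
	/* Each handle arrives as a byte offset relative to its own buffer;
	 * rebase it to an absolute byte offset within the pool (chunk start
	 * is counted in dwords, hence the * 4). */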
	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (rctx->b.chip_class < CAYMAN)
		evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family,
					   rctx->screen->b.info.drm_minor);
	else
		cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family,
					rctx->screen->b.info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {
		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate.  When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA |
						S_0286E8_TGID_ENA |
						S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops: they let
	 * you store the initial counter, the increment value, and the maximum
	 * counter value in a register, so that the hardware can compute the
	 * correct number of iterations for a loop and you don't need to keep
	 * the loop counter in your shader code.  We don't currently use this
	 * optimization, so we must keep track of the counter in the shader
	 * and use a break instruction to exit loops.  However, the hardware
	 * still uses this register to determine when to exit a loop, so we
	 * need to initialize the counter to 0, set the increment value to 1
	 * and the maximum counter value to 4095 (0xfff), which is the maximum
	 * value allowed.  This gives us a maximum of 4096 iterations for our
	 * loops, but hopefully our break instruction will execute some time
	 * before the 4096th iteration.
	 */
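	/* Decoded under that reading, 0x1000FFF is: increment 0x01 (bits
	 * 24-31), initial counter 0x000 (bits 12-23) and maximum count 0xFFF
	 * (bits 0-11).  The field layout here is our understanding of the
	 * LOOP_CONST encoding, not restated from a header in this file. */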
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0 + 3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void *r600_compute_global_transfer_map(struct pipe_context *ctx_,
				       struct pipe_resource *resource,
				       unsigned level,
				       unsigned usage,
				       const struct pipe_box *box,
				       struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx_);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_TRANSFER_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do it better; mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx_, dst,
			offset, box->width, usage, ptransfer);
}

void r600_compute_global_transfer_unmap(struct pipe_context *ctx_,
					struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool.  The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument, and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called, it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert(!"This function should not be called");
}

void r600_compute_global_transfer_flush_region(struct pipe_context *ctx_,
					       struct pipe_transfer *transfer,
					       const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(struct pipe_context *pipe,
					       struct pipe_resource *resource,
					       unsigned level,
					       unsigned usage,
					       const struct pipe_box *box,
					       const void *data,
					       unsigned stride,
					       unsigned layer_stride)
{
	assert(0 && "TODO");
}