evergreen_compute.c revision 65d67bcc4b23ca5b1fe1bd961fffb7ecf50864b9
1/*
2 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 *      Adam Rak <adam.rak@streamnovation.com>
25 */
26
27#include <stdio.h>
28#include <errno.h>
29#include "pipe/p_defines.h"
30#include "pipe/p_state.h"
31#include "pipe/p_context.h"
32#include "util/u_blitter.h"
33#include "util/u_double_list.h"
34#include "util/u_transfer.h"
35#include "util/u_surface.h"
36#include "util/u_pack_color.h"
37#include "util/u_memory.h"
38#include "util/u_inlines.h"
39#include "util/u_framebuffer.h"
40#include "pipebuffer/pb_buffer.h"
41#include "evergreend.h"
42#include "r600_resource.h"
43#include "r600_shader.h"
44#include "r600_pipe.h"
45#include "r600_formats.h"
46#include "evergreen_compute.h"
47#include "evergreen_compute_internal.h"
48#include "compute_memory_pool.h"
49#ifdef HAVE_OPENCL
50#include "radeon_llvm_util.h"
51#endif
52
53/**
54RAT0 is for global binding writes
55VTX1 is for global binding reads
56
57for writing images: RAT1...
58for reading images: TEX2...
59  TEX2 is paired with RAT1
60
61TEX2... consumes the same fetch resources that VTX2... would consume
62
63CONST0 and VTX0 are for parameters
64  CONST0 binds the smaller input parameter buffer and is used for constant
65  indexing; it is also constant cached
66  VTX0 is used for indirect/non-constant indexing, or if the input is bigger
67  than the constant cache can handle
68
69RATs are limited to 12, so we can bind at most 11 textures for writing
70because we reserve RAT0 for global bindings. With byte addressing enabled,
71we should reserve another one too => at most 10 image bindings for writing.
72
73from Nvidia OpenCL:
74  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
75  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
76
77so 10 for writing is enough. 176 is the max for reading according to the docs
78
79writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
80writable images also consume TEX slots, and VTX slots because of linear indexing
81
82*/
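
/* A minimal illustration of the slot convention described above.  These
 * helpers are hypothetical and not used by the driver; they only document
 * the mapping (RAT0 and VTX0/VTX1 are reserved, a writable image i is bound
 * to RAT(i+1), and compute resource i goes to vertex buffer i+2). */
#if 0
static inline unsigned writable_image_to_rat(unsigned image_id)
{
	assert(image_id + 1 < 12); /* RAT0 is reserved for global bindings */
	return image_id + 1;
}

static inline unsigned compute_resource_to_vtx(unsigned resource_id)
{
	return resource_id + 2; /* VTX0 = parameters, VTX1 = global buffer pool */
}
#endif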
83
84struct r600_resource* r600_compute_buffer_alloc_vram(
85       struct r600_screen *screen,
86       unsigned size)
87{
88	struct pipe_resource * buffer = NULL;
89	assert(size);
90
91	buffer = pipe_buffer_create(
92		(struct pipe_screen*) screen,
93		PIPE_BIND_CUSTOM,
94		PIPE_USAGE_IMMUTABLE,
95		size);
96
97	return (struct r600_resource *)buffer;
98}
99
100
101static void evergreen_set_rat(
102	struct r600_pipe_compute *pipe,
103	int id,
104	struct r600_resource* bo,
105	int start,
106	int size)
107{
108	struct pipe_surface rat_templ;
109	struct r600_surface *surf = NULL;
110	struct r600_context *rctx = NULL;
111
112	assert(id < 12);
113	assert((size & 3) == 0);
114	assert((start & 0xFF) == 0);
115
116	rctx = pipe->ctx;
117
118	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
119
120	/* Create the RAT surface */
121	memset(&rat_templ, 0, sizeof(rat_templ));
122	rat_templ.format = PIPE_FORMAT_R32_UINT;
123	rat_templ.u.tex.level = 0;
124	rat_templ.u.tex.first_layer = 0;
125	rat_templ.u.tex.last_layer = 0;
126
127	/* Add the RAT to the list of color buffers */
128	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->context.create_surface(
129		(struct pipe_context *)pipe->ctx,
130		(struct pipe_resource *)bo, &rat_templ);
131
132	/* Update the number of color buffers */
133	pipe->ctx->framebuffer.state.nr_cbufs =
134		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
135
136	/* Update the cb_target_mask
137	 * XXX: I think this is a potential spot for bugs once we start doing
138	 * GL interop.  cb_target_mask may be modified in the 3D sections
139	 * of this driver. */
140	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
141
142	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
143	evergreen_init_color_surface_rat(rctx, surf);
144}
145
146static void evergreen_cs_set_vertex_buffer(
147	struct r600_context * rctx,
148	unsigned vb_index,
149	unsigned offset,
150	struct pipe_resource * buffer)
151{
152	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
153	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
154	vb->stride = 1;
155	vb->buffer_offset = offset;
156	vb->buffer = buffer;
157	vb->user_buffer = NULL;
158
159	/* The vertex instructions in the compute shaders use the texture cache,
160	 * so we need to invalidate it. */
161	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
162	state->enabled_mask |= 1 << vb_index;
163	state->dirty_mask |= 1 << vb_index;
164	state->atom.dirty = true;
165}
166
167static void evergreen_cs_set_constant_buffer(
168	struct r600_context * rctx,
169	unsigned cb_index,
170	unsigned offset,
171	unsigned size,
172	struct pipe_resource * buffer)
173{
174	struct pipe_constant_buffer cb;
175	cb.buffer_size = size;
176	cb.buffer_offset = offset;
177	cb.buffer = buffer;
178	cb.user_buffer = NULL;
179
180	rctx->context.set_constant_buffer(&rctx->context, PIPE_SHADER_COMPUTE, cb_index, &cb);
181}
182
183static const struct u_resource_vtbl r600_global_buffer_vtbl =
184{
185	u_default_resource_get_handle, /* get_handle */
186	r600_compute_global_buffer_destroy, /* resource_destroy */
187	r600_compute_global_transfer_map, /* transfer_map */
188	r600_compute_global_transfer_flush_region,/* transfer_flush_region */
189	r600_compute_global_transfer_unmap, /* transfer_unmap */
190	r600_compute_global_transfer_inline_write /* transfer_inline_write */
191};
192
193
194void *evergreen_create_compute_state(
195	struct pipe_context *ctx_,
196	const struct pipe_compute_state *cso)
197{
198	struct r600_context *ctx = (struct r600_context *)ctx_;
199	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
200
201#ifdef HAVE_OPENCL
202	const struct pipe_llvm_program_header * header;
203	const unsigned char * code;
204	unsigned i;
205
206	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");
207
208	header = cso->prog;
209	code = cso->prog + sizeof(struct pipe_llvm_program_header);
210#endif
211
212	shader->ctx = (struct r600_context*)ctx;
213	shader->local_size = cso->req_local_mem; ///TODO: assert it
214	shader->private_size = cso->req_private_mem;
215	shader->input_size = cso->req_input_mem;
216
217#ifdef HAVE_OPENCL
218	shader->num_kernels = radeon_llvm_get_num_kernels(code, header->num_bytes);
219	shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);
220
221	for (i = 0; i < shader->num_kernels; i++) {
222		struct r600_kernel *kernel = &shader->kernels[i];
223		kernel->llvm_module = radeon_llvm_get_kernel_module(i, code,
224							header->num_bytes);
225	}
226#endif
227	return shader;
228}
229
230void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
231{
232	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;
233
234	free(shader);
235}
236
237static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
238{
239	struct r600_context *ctx = (struct r600_context *)ctx_;
240
241	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");
242
243	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
244}
245
246/* The kernel parameters are stored in a vtx buffer (ID=0); besides the explicit
247 * kernel parameters, there are implicit parameters that need to be stored
248 * in the vertex buffer as well.  Here is how these parameters are organized in
249 * the buffer:
250 *
251 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
252 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
253 * DWORDS 6-8: Number of work items within each work group in each dimension
254 *             (x,y,z)
255 * DWORDS 9+ : Kernel parameters
256 */
257void evergreen_compute_upload_input(
258	struct pipe_context *ctx_,
259	const uint *block_layout,
260	const uint *grid_layout,
261	const void *input)
262{
263	struct r600_context *ctx = (struct r600_context *)ctx_;
264	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
265	int i;
266	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
267	 * parameters.
268	 */
269	unsigned input_size = shader->input_size + 36;
270	uint32_t * num_work_groups_start;
271	uint32_t * global_size_start;
272	uint32_t * local_size_start;
273	uint32_t * kernel_parameters_start;
274	struct pipe_box box;
275	struct pipe_transfer *transfer = NULL;
276
277	if (shader->input_size == 0) {
278		return;
279	}
280
281	if (!shader->kernel_param) {
282		/* Add space for the grid dimensions */
283		shader->kernel_param = (struct r600_resource *)
284			pipe_buffer_create(ctx_->screen, PIPE_BIND_CUSTOM,
285					PIPE_USAGE_IMMUTABLE, input_size);
286	}
287
288	u_box_1d(0, input_size, &box);
289	num_work_groups_start = ctx_->transfer_map(ctx_,
290			(struct pipe_resource*)shader->kernel_param,
291			0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE,
292			&box, &transfer);
293	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
294	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
295	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
296
297	/* Copy the number of work groups */
298	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));
299
300	/* Copy the global size */
301	for (i = 0; i < 3; i++) {
302		global_size_start[i] = grid_layout[i] * block_layout[i];
303	}
304
305	/* Copy the local dimensions */
306	memcpy(local_size_start, block_layout, 3 * sizeof(uint));
307
308	/* Copy the kernel inputs */
309	memcpy(kernel_parameters_start, input, shader->input_size);
310
311	for (i = 0; i < (input_size / 4); i++) {
312		COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
313			((unsigned*)num_work_groups_start)[i]);
314	}
315
316	ctx_->transfer_unmap(ctx_, transfer);
317
318	/* ID=0 is reserved for the parameters */
319	evergreen_cs_set_constant_buffer(ctx, 0, 0, input_size,
320			(struct pipe_resource*)shader->kernel_param);
321}
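
/* For reference, a hedged sketch of the input buffer layout documented above,
 * expressed as a hypothetical struct (illustrative only, not used by the
 * driver; it assumes 32-bit uints, matching the offsets computed in
 * evergreen_compute_upload_input()). */
#if 0
struct compute_input_layout {
	uint32_t num_work_groups[3];  /* DWORDs 0-2: grid size in work groups */
	uint32_t global_size[3];      /* DWORDs 3-5: grid * block, per dimension */
	uint32_t local_size[3];       /* DWORDs 6-8: work items per work group */
	uint32_t kernel_parameters[]; /* DWORDs 9+ : user-supplied kernel arguments */
};
#endif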
322
323static void evergreen_emit_direct_dispatch(
324		struct r600_context *rctx,
325		const uint *block_layout, const uint *grid_layout)
326{
327	int i;
328	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
329	unsigned num_waves;
330	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
331	unsigned wave_divisor = (16 * num_pipes);
332	int group_size = 1;
333	int grid_size = 1;
334	/* XXX: Enable lds and get size from cs_shader_state */
335	unsigned lds_size = 0;
336
337	/* Calculate group_size/grid_size */
338	for (i = 0; i < 3; i++) {
339		group_size *= block_layout[i];
340	}
341
342	for (i = 0; i < 3; i++)	{
343		grid_size *= grid_layout[i];
344	}
345
346	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
347	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
348			wave_divisor - 1) / wave_divisor;
349
350	COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n",
351							num_pipes, num_waves);
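
	/* Worked example with illustrative numbers: a 16x16x1 thread block on
	 * a part with r600_max_pipes = 8 gives wave_divisor = 128, so
	 * num_waves = ceil(256 / 128) = 2. */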
352
353	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
354	 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
355	 * We may need to allocate the entire LDS space for Compute Shaders.
356	 *
357	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
358	 * CM: CM_R_0286FC_SPI_LDS_MGMT :=  S_0286FC_NUM_LS_LDS(lds_dwords)
359	 */
360
361	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
362
363	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
364	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
365	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
366	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
367
368	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
369								group_size);
370
371	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
372	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
373	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
374	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
375
376	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
377					lds_size | (num_waves << 14));
378
379	/* Dispatch packet */
380	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
381	r600_write_value(cs, grid_layout[0]);
382	r600_write_value(cs, grid_layout[1]);
383	r600_write_value(cs, grid_layout[2]);
384	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
385	r600_write_value(cs, 1);
386}
387
388static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
389		const uint *grid_layout)
390{
391	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
392	unsigned flush_flags = 0;
393	int i;
394
395	/* make sure that the gfx ring is the only one active */
396	if (ctx->rings.dma.cs) {
397		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
398	}
399
400	/* Initialize all the compute-related registers.
401	 *
402	 * See evergreen_init_atom_start_compute_cs() in this file for the list
403	 * of registers initialized by the start_compute_cs_cmd atom.
404	 */
405	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);
406
407	ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
408	r600_flush_emit(ctx);
409
410	/* Emit colorbuffers. */
411	for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
412		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
413		unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
414						       (struct r600_resource*)cb->base.texture,
415						       RADEON_USAGE_READWRITE);
416
417		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
418		r600_write_value(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
419		r600_write_value(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
420		r600_write_value(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
421		r600_write_value(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
422		r600_write_value(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
423		r600_write_value(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
424		r600_write_value(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
425
426		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
427		r600_write_value(cs, reloc);
428
429		if (!ctx->keep_tiling_flags) {
430			r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
431			r600_write_value(cs, reloc);
432		}
433
434		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
435		r600_write_value(cs, reloc);
436	}
437
438	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
439	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
440					ctx->compute_cb_target_mask);
441
442
443	/* Emit vertex buffer state */
444	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
445	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);
446
447	/* Emit constant buffer state */
448	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
449
450	/* Emit compute shader state */
451	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
452
453	/* Emit dispatch state and dispatch packet */
454	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);
455
456	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
457	 */
458	ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
459	r600_flush_emit(ctx);
460
461#if 0
462	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
463	for (i = 0; i < cs->cdw; i++) {
464		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
465	}
466#endif
467
468	flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
469	if (ctx->keep_tiling_flags) {
470		flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
471	}
472
473	ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags, ctx->screen->cs_count++);
474
475	ctx->flags = 0;
476
477	COMPUTE_DBG(ctx->screen, "shader started\n");
478}
479
480
481/**
482 * Emit function for r600_cs_shader_state atom
483 */
484void evergreen_emit_cs_shader(
485		struct r600_context *rctx,
486		struct r600_atom *atom)
487{
488	struct r600_cs_shader_state *state =
489					(struct r600_cs_shader_state*)atom;
490	struct r600_pipe_compute *shader = state->shader;
491	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
492	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
493	uint64_t va;
494
495	va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b);
496
497	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
498	r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
499	r600_write_value(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
500			S_0288D4_NUM_GPRS(kernel->bc.ngpr)
501			| S_0288D4_STACK_SIZE(kernel->bc.nstack));
502	r600_write_value(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
503
504	r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
505	r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx,
506							kernel->code_bo, RADEON_USAGE_READ));
507
508	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
509}
510
511static void evergreen_launch_grid(
512		struct pipe_context *ctx_,
513		const uint *block_layout, const uint *grid_layout,
514		uint32_t pc, const void *input)
515{
516	struct r600_context *ctx = (struct r600_context *)ctx_;
517
518#ifdef HAVE_OPENCL
519	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);
520
521	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
522	if (!shader->kernels[pc].code_bo) {
523		void *p;
524		struct r600_kernel *kernel = &shader->kernels[pc];
525		r600_compute_shader_create(ctx_, kernel->llvm_module, &kernel->bc);
526		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
527							kernel->bc.ndw * 4);
528		p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
529		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
530		ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
531	}
532#endif
533
534	ctx->cs_shader_state.kernel_index = pc;
535	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
536	compute_emit_cs(ctx, block_layout, grid_layout);
537}
538
539static void evergreen_set_compute_resources(struct pipe_context * ctx_,
540		unsigned start, unsigned count,
541		struct pipe_surface ** surfaces)
542{
543	struct r600_context *ctx = (struct r600_context *)ctx_;
544	struct r600_surface **resources = (struct r600_surface **)surfaces;
545
546	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
547			start, count);
548
549	for (int i = 0; i < count; i++)	{
550		/* The first two vertex buffers are reserved for parameters and
551		 * global buffers. */
552		unsigned vtx_id = 2 + i;
553		if (resources[i]) {
554			struct r600_resource_global *buffer =
555				(struct r600_resource_global*)
556				resources[i]->base.texture;
557			if (resources[i]->base.writable) {
558				assert(i+1 < 12);
559
560				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
561				(struct r600_resource *)resources[i]->base.texture,
562				buffer->chunk->start_in_dw*4,
563				resources[i]->base.texture->width0);
564			}
565
566			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
567					buffer->chunk->start_in_dw * 4,
568					resources[i]->base.texture);
569		}
570	}
571}
572
573static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
574		unsigned start_slot, unsigned count,
575		struct pipe_sampler_view **views)
576{
577	struct r600_pipe_sampler_view **resource =
578		(struct r600_pipe_sampler_view **)views;
579
580	for (int i = 0; i < count; i++)	{
581		if (resource[i]) {
582			assert(i+1 < 12);
583			/* XXX: Implement */
584			assert(!"Compute samplers not implemented.");
585			///FETCH0 = VTX0 (param buffer),
586			//FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
587		}
588	}
589}
590
591static void evergreen_bind_compute_sampler_states(
592	struct pipe_context *ctx_,
593	unsigned start_slot,
594	unsigned num_samplers,
595	void **samplers_)
596{
597	struct compute_sampler_state ** samplers =
598		(struct compute_sampler_state **)samplers_;
599
600	for (int i = 0; i < num_samplers; i++) {
601		if (samplers[i]) {
602			/* XXX: Implement */
603			assert(!"Compute samplers not implemented.");
604		}
605	}
606}
607
608static void evergreen_set_global_binding(
609	struct pipe_context *ctx_, unsigned first, unsigned n,
610	struct pipe_resource **resources,
611	uint32_t **handles)
612{
613	struct r600_context *ctx = (struct r600_context *)ctx_;
614	struct compute_memory_pool *pool = ctx->screen->global_pool;
615	struct r600_resource_global **buffers =
616		(struct r600_resource_global **)resources;
617
618	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
619			first, n);
620
621	if (!resources) {
622		/* XXX: Unset */
623		return;
624	}
625
626	compute_memory_finalize_pending(pool, ctx_);
627
628	for (int i = 0; i < n; i++)
629	{
630		assert(resources[i]->target == PIPE_BUFFER);
631		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
632
633		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
634	}
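
	/* The value written to each handle above is a byte offset into the
	 * global memory pool (start_in_dw is in dwords, hence the * 4).  The
	 * whole pool is bound below as RAT0 and as vertex buffer 1 with a base
	 * offset of 0, so kernels can use these byte offsets directly. */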
635
636	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
637	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
638				(struct pipe_resource*)pool->bo);
639}
640
641/**
642 * This function initializes all the compute specific registers that need to
643 * be initialized for each compute command stream.  Registers that are common
644 * to both compute and 3D will be initialized at the beginning of each compute
645 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
646 * packet requires that the shader type bit be set, we must initialize all
647 * context registers needed for compute in this function.  The registers
648 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
649 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
650 * on the GPU family.
651 */
652void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
653{
654	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
655	int num_threads;
656	int num_stack_entries;
657
658	/* since all required registers are initialised in the
659	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
660	 */
661	r600_init_command_buffer(cb, 256);
662	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
663
664	/* This must be first. */
665	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
666	r600_store_value(cb, 0x80000000);
667	r600_store_value(cb, 0x80000000);
668
669	/* We're setting config registers here. */
670	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
671	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
672
673	switch (ctx->family) {
674	case CHIP_CEDAR:
675	default:
676		num_threads = 128;
677		num_stack_entries = 256;
678		break;
679	case CHIP_REDWOOD:
680		num_threads = 128;
681		num_stack_entries = 256;
682		break;
683	case CHIP_JUNIPER:
684		num_threads = 128;
685		num_stack_entries = 512;
686		break;
687	case CHIP_CYPRESS:
688	case CHIP_HEMLOCK:
689		num_threads = 128;
690		num_stack_entries = 512;
691		break;
692	case CHIP_PALM:
693		num_threads = 128;
694		num_stack_entries = 256;
695		break;
696	case CHIP_SUMO:
697		num_threads = 128;
698		num_stack_entries = 256;
699		break;
700	case CHIP_SUMO2:
701		num_threads = 128;
702		num_stack_entries = 512;
703		break;
704	case CHIP_BARTS:
705		num_threads = 128;
706		num_stack_entries = 512;
707		break;
708	case CHIP_TURKS:
709		num_threads = 128;
710		num_stack_entries = 256;
711		break;
712	case CHIP_CAICOS:
713		num_threads = 128;
714		num_stack_entries = 256;
715		break;
716	}
717
718	/* Config Registers */
719	if (ctx->chip_class < CAYMAN)
720		evergreen_init_common_regs(cb, ctx->chip_class, ctx->family,
721					   ctx->screen->info.drm_minor);
722	else
723		cayman_init_common_regs(cb, ctx->chip_class, ctx->family,
724					ctx->screen->info.drm_minor);
725
726	/* The primitive type always needs to be POINTLIST for compute. */
727	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
728						V_008958_DI_PT_POINTLIST);
729
730	if (ctx->chip_class < CAYMAN) {
731
732		/* These registers control which simds can be used by each stage.
733		 * The default for these registers is 0xffffffff, which means
734		 * all simds are available for each stage.  It's possible we may
735		 * want to play around with these in the future, but for now
736		 * the default value is fine.
737		 *
738		 * R_008E20_SQ_STATIC_THREAD_MGMT1
739		 * R_008E24_SQ_STATIC_THREAD_MGMT2
740		 * R_008E28_SQ_STATIC_THREAD_MGMT3
741		 */
742
743		/* XXX: We may need to adjust the thread and stack resource
744		 * values for 3D/compute interop */
745
746		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
747
748		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
749		 * Set the number of threads used by the PS/VS/GS/ES stage to
750		 * 0.
751		 */
752		r600_store_value(cb, 0);
753
754		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
755		 * Set the number of threads used by the CS (aka LS) stage to
756		 * the maximum number of threads and set the number of threads
757		 * for the HS stage to 0. */
758		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
759
760		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
761		 * Set the Control Flow stack entries to 0 for PS/VS stages */
762		r600_store_value(cb, 0);
763
764		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
765		 * Set the Control Flow stack entries to 0 for GS/ES stages */
766		r600_store_value(cb, 0);
767
768		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
769		 * Set the Control Flow stack entries to 0 for the HS stage, and
770		 * set it to the maximum value for the CS (aka LS) stage. */
771		r600_store_value(cb,
772			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
773	}
774
775	/* Context Registers */
776
777	if (ctx->chip_class < CAYMAN) {
778		/* workaround for hw issues with dyn gpr - must set all limits
779		 * to 240 instead of 0, 0x1e == 240 / 8
780		 */
781		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
782				S_028838_PS_GPRS(0x1e) |
783				S_028838_VS_GPRS(0x1e) |
784				S_028838_GS_GPRS(0x1e) |
785				S_028838_ES_GPRS(0x1e) |
786				S_028838_HS_GPRS(0x1e) |
787				S_028838_LS_GPRS(0x1e));
788	}
789
790	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
791	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
792		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
793
794	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
795
796	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
797						S_0286E8_TID_IN_GROUP_ENA
798						| S_0286E8_TGID_ENA
799						| S_0286E8_DISABLE_INDEX_PACK);
801
802	/* The LOOP_CONST registers are an optimization for loops that lets
803	 * you store the initial counter, increment value, and maximum
804	 * counter value in a register so that the hardware can calculate the
805	 * correct number of iterations for the loop and you don't need
806	 * to keep the loop counter in your shader code.  We don't currently use
807	 * this optimization, so we must keep track of the counter in the
808	 * shader and use a break instruction to exit loops.  However, the
809	 * hardware still uses this register to determine when to exit a
810	 * loop, so we need to initialize the counter to 0, set the increment
811	 * value to 1 and the maximum counter value to 4095 (0xfff), which
812	 * is the maximum value allowed.  This gives us a maximum of 4096
813	 * iterations for our loops, but hopefully our break instruction will
814	 * execute some time before the 4096th iteration.
815	 */
816	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
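
	/* For reference, a breakdown of the 0x1000FFF value above, assuming the
	 * usual SQ_LOOP_CONST packing (trip count in bits [11:0], initial value
	 * in bits [23:12], increment in bits [31:24]):
	 *   (1 << 24) | (0 << 12) | 0xFFF  ->  increment 1, init 0, max 4095 */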
817}
818
819void evergreen_init_compute_state_functions(struct r600_context *ctx)
820{
821	ctx->context.create_compute_state = evergreen_create_compute_state;
822	ctx->context.delete_compute_state = evergreen_delete_compute_state;
823	ctx->context.bind_compute_state = evergreen_bind_compute_state;
824//	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
825	ctx->context.set_compute_resources = evergreen_set_compute_resources;
826	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
827	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
828	ctx->context.set_global_binding = evergreen_set_global_binding;
829	ctx->context.launch_grid = evergreen_launch_grid;
830
831	/* We always use at least one vertex buffer for the global buffer pool (id = 1) */
832	ctx->cs_vertex_buffer_state.enabled_mask =
833	ctx->cs_vertex_buffer_state.dirty_mask = 0x2;
834}
835
836
837struct pipe_resource *r600_compute_global_buffer_create(
838	struct pipe_screen *screen,
839	const struct pipe_resource *templ)
840{
841	struct r600_resource_global* result = NULL;
842	struct r600_screen* rscreen = NULL;
843	int size_in_dw = 0;
844
845	assert(templ->target == PIPE_BUFFER);
846	assert(templ->bind & PIPE_BIND_GLOBAL);
847	assert(templ->array_size == 1 || templ->array_size == 0);
848	assert(templ->depth0 == 1 || templ->depth0 == 0);
849	assert(templ->height0 == 1 || templ->height0 == 0);
850
851	result = (struct r600_resource_global*)
852	CALLOC(sizeof(struct r600_resource_global), 1);
853	rscreen = (struct r600_screen*)screen;
854
855	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
856	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
857			templ->array_size);
858
859	result->base.b.vtbl = &r600_global_buffer_vtbl;
860	result->base.b.b.screen = screen;
861	result->base.b.b = *templ;
862	pipe_reference_init(&result->base.b.b.reference, 1);
863
864	size_in_dw = (templ->width0+3) / 4;
865
866	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
867
868	if (result->chunk == NULL)
869	{
870		free(result);
871		return NULL;
872	}
873
874	return &result->base.b.b;
875}
876
877void r600_compute_global_buffer_destroy(
878	struct pipe_screen *screen,
879	struct pipe_resource *res)
880{
881	struct r600_resource_global* buffer = NULL;
882	struct r600_screen* rscreen = NULL;
883
884	assert(res->target == PIPE_BUFFER);
885	assert(res->bind & PIPE_BIND_GLOBAL);
886
887	buffer = (struct r600_resource_global*)res;
888	rscreen = (struct r600_screen*)screen;
889
890	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
891
892	buffer->chunk = NULL;
893	free(res);
894}
895
896void *r600_compute_global_transfer_map(
897	struct pipe_context *ctx_,
898	struct pipe_resource *resource,
899	unsigned level,
900	unsigned usage,
901	const struct pipe_box *box,
902	struct pipe_transfer **ptransfer)
903{
904	struct r600_context *rctx = (struct r600_context*)ctx_;
905	struct compute_memory_pool *pool = rctx->screen->global_pool;
906	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
907	struct r600_resource_global* buffer =
908		(struct r600_resource_global*)resource;
909	uint32_t* map;
910
911	compute_memory_finalize_pending(pool, ctx_);
912
913	assert(resource->target == PIPE_BUFFER);
914
915	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
916			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
917			"width = %u, height = %u, depth = %u)\n", level, usage,
918			box->x, box->y, box->z, box->width, box->height,
919			box->depth);
920
921	transfer->resource = resource;
922	transfer->level = level;
923	transfer->usage = usage;
924	transfer->box = *box;
925	transfer->stride = 0;
926	transfer->layer_stride = 0;
927
928	assert(transfer->resource->target == PIPE_BUFFER);
929	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
930	assert(transfer->box.x >= 0);
931	assert(transfer->box.y == 0);
932	assert(transfer->box.z == 0);
933
934	///TODO: do it better, mapping is not possible if the pool is too big
935
936	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n");
937
938	if (!(map = r600_buffer_mmap_sync_with_rings(rctx, buffer->chunk->pool->bo, transfer->usage))) {
939		util_slab_free(&rctx->pool_transfers, transfer);
940		return NULL;
941	}
942
943	*ptransfer = transfer;
944
945	COMPUTE_DBG(rctx->screen, "Buffer: %p + %u (buffer offset in global memory) "
946		"+ %u (box.x)\n", map, buffer->chunk->start_in_dw, transfer->box.x);
947	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
948}
949
950void r600_compute_global_transfer_unmap(
951	struct pipe_context *ctx_,
952	struct pipe_transfer* transfer)
953{
954	struct r600_context *ctx = NULL;
955	struct r600_resource_global* buffer = NULL;
956
957	assert(transfer->resource->target == PIPE_BUFFER);
958	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
959
960	ctx = (struct r600_context *)ctx_;
961	buffer = (struct r600_resource_global*)transfer->resource;
962
963	COMPUTE_DBG(ctx->screen, "* r600_compute_global_transfer_unmap()\n");
964
965	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
966	util_slab_free(&ctx->pool_transfers, transfer);
967}
968
969void r600_compute_global_transfer_flush_region(
970	struct pipe_context *ctx_,
971	struct pipe_transfer *transfer,
972	const struct pipe_box *box)
973{
974	assert(0 && "TODO");
975}
976
977void r600_compute_global_transfer_inline_write(
978	struct pipe_context *pipe,
979	struct pipe_resource *resource,
980	unsigned level,
981	unsigned usage,
982	const struct pipe_box *box,
983	const void *data,
984	unsigned stride,
985	unsigned layer_stride)
986{
987	assert(0 && "TODO");
988}
989