evergreen_compute.c revision ec7d775790bef929b15e4c82d68ccaaf92c9f6b7
/*
 * Copyright 2011 Adam Rak <adam.rak@streamnovation.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <adam.rak@streamnovation.com>
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/u_double_list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "r600.h"
#include "evergreend.h"
#include "r600_resource.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#ifdef HAVE_OPENCL
#include "llvm_wrapper.h"
#endif

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
  TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 is for binding a smaller input parameter buffer and for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => at most 10 image bindings for writing.

from Nvidia OpenCL:
  CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first (< 10), so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too because of linear indexing

*/

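/* Bind @buffer to compute vertex-buffer slot @vb_index at byte @offset.
 * Compute shaders read these slots through the VTX fetch path, which uses
 * the texture cache, so the read caches are invalidated and the
 * cs_vertex_buffer_state atom is marked dirty so the new binding is emitted
 * with the next dispatch.
 */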
static void evergreen_cs_set_vertex_buffer(
	struct r600_context * rctx,
	unsigned vb_index,
	unsigned offset,
	struct pipe_resource * buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer = buffer;
	vb->user_buffer = NULL;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	state->atom.dirty = true;
}

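/* Resource vtbl for PIPE_BIND_GLOBAL buffers: it routes destroy and the
 * transfer hooks to the r600_compute_global_* implementations at the end of
 * this file, which operate on sub-allocations from the global memory pool.
 */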
static const struct u_resource_vtbl r600_global_buffer_vtbl =
{
	u_default_resource_get_handle, /* get_handle */
	r600_compute_global_buffer_destroy, /* resource_destroy */
	r600_compute_global_transfer_map, /* transfer_map */
	r600_compute_global_transfer_flush_region, /* transfer_flush_region */
	r600_compute_global_transfer_unmap, /* transfer_unmap */
	r600_compute_global_transfer_inline_write /* transfer_inline_write */
};


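/* Create a compute state object from the serialized LLVM program in
 * cso->prog.  When built with OpenCL support, the individual kernel modules
 * are split out of the program here; each kernel is compiled to r600
 * bytecode lazily in evergreen_launch_grid() the first time it is launched.
 */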
void *evergreen_create_compute_state(
	struct pipe_context *ctx_,
	const struct pipe_compute_state *cso)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);

#ifdef HAVE_OPENCL
	const struct pipe_llvm_program_header * header;
	const unsigned char * code;
	unsigned i;

	COMPUTE_DBG(ctx->screen, "*** evergreen_create_compute_state\n");

	header = cso->prog;
	code = cso->prog + sizeof(struct pipe_llvm_program_header);
#endif

	shader->ctx = (struct r600_context*)ctx;
	shader->resources = (struct evergreen_compute_resource*)
			CALLOC(sizeof(struct evergreen_compute_resource),
			get_compute_resource_num());
	shader->local_size = cso->req_local_mem; ///TODO: assert it
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

#ifdef HAVE_OPENCL
	shader->num_kernels = llvm_get_num_kernels(code, header->num_bytes);
	shader->kernels = CALLOC(sizeof(struct r600_kernel), shader->num_kernels);

	for (i = 0; i < shader->num_kernels; i++) {
		struct r600_kernel *kernel = &shader->kernels[i];
		kernel->llvm_module = llvm_get_kernel_module(i, code,
							header->num_bytes);
	}
#endif
	return shader;
}

void evergreen_delete_compute_state(struct pipe_context *ctx, void* state)
{
	struct r600_pipe_compute *shader = (struct r600_pipe_compute *)state;

	free(shader->resources);
	free(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx_, void *state)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

	COMPUTE_DBG(ctx->screen, "*** evergreen_bind_compute_state\n");

	ctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0).  Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well.  Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
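/* As a worked example (hypothetical launch, for illustration only): with
 * block_layout = {16, 16, 1} and grid_layout = {4, 2, 1}, the buffer starts
 * with 4,2,1 (work groups), then 64,32,1 (global work items, i.e.
 * grid * block per dimension), then 16,16,1 (local size), followed by the
 * kernel parameters at byte offset 36.
 */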
void evergreen_compute_upload_input(
	struct pipe_context *ctx_,
	const uint *block_layout,
	const uint *grid_layout,
	const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	int i;
	unsigned kernel_parameters_offset_bytes = 36;
	uint32_t * num_work_groups_start;
	uint32_t * global_size_start;
	uint32_t * local_size_start;
	uint32_t * kernel_parameters_start;

	if (shader->input_size == 0) {
		return;
	}

	if (!shader->kernel_param) {
		unsigned buffer_size = shader->input_size;

		/* Add space for the grid dimensions */
		buffer_size += kernel_parameters_offset_bytes * sizeof(uint);
		shader->kernel_param = r600_compute_buffer_alloc_vram(
						ctx->screen, buffer_size);
	}

	num_work_groups_start = r600_buffer_mmap_sync_with_rings(ctx, shader->kernel_param, PIPE_TRANSFER_WRITE);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint) / 4));
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint) / 4));

	/* Copy the number of work groups */
	memcpy(num_work_groups_start, grid_layout, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = grid_layout[i] * block_layout[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, block_layout, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, input, shader->input_size);

	for (i = 0; i < (kernel_parameters_offset_bytes / 4) +
					(shader->input_size / 4); i++) {
		COMPUTE_DBG(ctx->screen, "input %i : %i\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->ws->buffer_unmap(shader->kernel_param->cs_buf);

	///ID=0 is reserved for the parameters
	evergreen_cs_set_vertex_buffer(ctx, 0, 0,
			(struct pipe_resource*)shader->kernel_param);
	///ID=0 is reserved for parameters
	evergreen_set_const_cache(shader, 0, shader->kernel_param,
						shader->input_size, 0);
}

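/* Emit the register writes and the DISPATCH_DIRECT packet for one grid
 * launch.  As a worked example of the wavefront count computed below
 * (numbers are illustrative, not tied to a specific chip): a 16x16x1 thread
 * block is 256 work items; with r600_max_pipes == 8 the divisor is
 * 16 * 8 = 128, so num_waves = ceil(256 / 128) = 2 wavefronts per block.
 */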
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* XXX: Enable lds and get size from cs_shader_state */
	unsigned lds_size = 0;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
	 * on Evergreen) goes to Pixel Shaders and half goes to Compute Shaders.
	 * We may need to allocate the entire LDS space for Compute Shaders.
	 *
	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
	 * CM: CM_R_0286FC_SPI_LDS_MGMT :=  S_0286FC_NUM_LS_LDS(lds_dwords)
	 */

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	r600_write_value(cs, grid_layout[0]);
	r600_write_value(cs, grid_layout[1]);
	r600_write_value(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	r600_write_value(cs, 1);
}

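/* Build and submit the command stream for one compute dispatch: flush any
 * pending DMA work so the gfx ring is the only active one, emit the
 * start_compute_cs_cmd preamble, the bound colorbuffers, vertex buffers,
 * shader state and compute resources, then the dispatch itself, and finally
 * flush the CS and block on one of the referenced buffers until the shader
 * has finished.
 */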
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
	unsigned flush_flags = 0;
	int i;
	struct r600_resource *onebo = NULL;
	struct evergreen_compute_resource *resources =
					ctx->cs_shader_state.shader->resources;

	/* make sure that the gfx ring is the only active one */
	if (ctx->rings.dma.cs) {
		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC);
	}

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	ctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Emit colorbuffers. */
	for (i = 0; i < ctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
		unsigned reloc = r600_context_bo_reloc(ctx, &ctx->rings.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE);

		r600_write_compute_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		r600_write_value(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		r600_write_value(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		r600_write_value(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		r600_write_value(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		r600_write_value(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		r600_write_value(cs, reloc);

		if (!ctx->keep_tiling_flags) {
			r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			r600_write_value(cs, reloc);
		}

		r600_write_value(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		r600_write_value(cs, reloc);
	}

	/* Set CB_TARGET_MASK.  XXX: Use cb_misc_state */
	r600_write_compute_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);


	/* Emit vertex buffer state */
	ctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Emit compute shader state */
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	for (i = 0; i < get_compute_resource_num(); i++) {
		if (resources[i].enabled) {
			int j;
			COMPUTE_DBG(ctx->screen, "resnum: %i, cdw: %i\n", i, cs->cdw);

			for (j = 0; j < resources[i].cs_end; j++) {
				if (resources[i].do_reloc[j]) {
					assert(resources[i].bo);
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}

				cs->buf[cs->cdw++] = resources[i].cs[j];
			}

			if (resources[i].bo) {
				onebo = resources[i].bo;
				evergreen_emit_ctx_reloc(ctx,
					resources[i].bo,
					resources[i].usage);

				///special case for textures
				if (resources[i].do_reloc
					[resources[i].cs_end] == 2) {
					evergreen_emit_ctx_reloc(ctx,
						resources[i].bo,
						resources[i].usage);
				}
			}
		}
	}

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff */
	ctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
	r600_flush_emit(ctx);

#if 0
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

	flush_flags = RADEON_FLUSH_ASYNC | RADEON_FLUSH_COMPUTE;
	if (ctx->keep_tiling_flags) {
		flush_flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
	}

	ctx->ws->cs_flush(ctx->rings.gfx.cs, flush_flags);

	ctx->flags = 0;

	COMPUTE_DBG(ctx->screen, "shader started\n");

	ctx->ws->buffer_wait(onebo->buf, 0);

	COMPUTE_DBG(ctx->screen, "...\n");
}


/**
 * Emit function for the r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
	uint64_t va;

	va = r600_resource_va(&rctx->screen->screen, &kernel->code_bo->b.b);

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	r600_write_value(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	r600_write_value(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(kernel->bc.ngpr)
			| S_0288D4_STACK_SIZE(kernel->bc.nstack));
	r600_write_value(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	r600_write_value(cs, PKT3C(PKT3_NOP, 0, 0));
	r600_write_value(cs, r600_context_bo_reloc(rctx, &rctx->rings.gfx,
							kernel->code_bo, RADEON_USAGE_READ));

	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
}

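/* pipe_context::launch_grid entry point.  When OpenCL support is compiled
 * in, the kernel selected by @pc is compiled from its LLVM module to r600
 * bytecode on first use and uploaded to VRAM; the kernel inputs are then
 * uploaded and the dispatch is emitted.
 */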
static void evergreen_launch_grid(
		struct pipe_context *ctx_,
		const uint *block_layout, const uint *grid_layout,
		uint32_t pc, const void *input)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;

#ifdef HAVE_OPENCL
	COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc);

	struct r600_pipe_compute *shader = ctx->cs_shader_state.shader;
	if (!shader->kernels[pc].code_bo) {
		void *p;
		struct r600_kernel *kernel = &shader->kernels[pc];
		r600_compute_shader_create(ctx_, kernel->llvm_module, &kernel->bc);
		kernel->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
							kernel->bc.ndw * 4);
		p = r600_buffer_mmap_sync_with_rings(ctx, kernel->code_bo, PIPE_TRANSFER_WRITE);
		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
		ctx->ws->buffer_unmap(kernel->code_bo->cs_buf);
	}
#endif

	ctx->cs_shader_state.kernel_index = pc;
	evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input);
	compute_emit_cs(ctx, block_layout, grid_layout);
}

static void evergreen_set_compute_resources(struct pipe_context * ctx_,
		unsigned start, unsigned count,
		struct pipe_surface ** surfaces)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
			start, count);

	for (int i = 0; i < count; i++)	{
		/* The first two vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 2 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(ctx->cs_shader_state.shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(ctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
		unsigned start_slot, unsigned count,
		struct pipe_sampler_view **views)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct r600_pipe_sampler_view **resource =
		(struct r600_pipe_sampler_view **)views;

	for (int i = 0; i < count; i++)	{
		if (resource[i]) {
			assert(i+1 < 12);
			/* FETCH0 = VTX0 (param buffer),
			 * FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX */
			evergreen_set_tex_resource(ctx->cs_shader_state.shader, resource[i], i+2);
		}
	}
}

static void evergreen_bind_compute_sampler_states(
	struct pipe_context *ctx_,
	unsigned start_slot,
	unsigned num_samplers,
	void **samplers_)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_sampler_state ** samplers =
		(struct compute_sampler_state **)samplers_;

	for (int i = 0; i < num_samplers; i++) {
		if (samplers[i]) {
			evergreen_set_sampler_resource(
				ctx->cs_shader_state.shader, samplers[i], i);
		}
	}
}

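/* Bind the global memory pool for the current kernels.  Each handle is
 * filled with the byte offset of the corresponding buffer's chunk inside
 * the pool, and the whole pool is then bound as RAT0 (for writes) and as
 * vertex buffer 1 (for reads), matching the binding layout described at the
 * top of this file.
 */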
static void evergreen_set_global_binding(
	struct pipe_context *ctx_, unsigned first, unsigned n,
	struct pipe_resource **resources,
	uint32_t **handles)
{
	struct r600_context *ctx = (struct r600_context *)ctx_;
	struct compute_memory_pool *pool = ctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;

	COMPUTE_DBG(ctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
			first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	compute_memory_finalize_pending(pool, ctx_);

	for (int i = 0; i < n; i++)
	{
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		*(handles[i]) = buffers[i]->chunk->start_in_dw * 4;
	}

	evergreen_set_rat(ctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	evergreen_cs_set_vertex_buffer(ctx, 1, 0,
				(struct pipe_resource*)pool->bo);
}

/**
 * This function initializes all the compute-specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in
 * the functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs
 * depending on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* This must be first. */
	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	r600_store_value(cb, 0x80000000);
	r600_store_value(cb, 0x80000000);

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (ctx->family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	if (ctx->chip_class < CAYMAN)
		evergreen_init_common_regs(cb, ctx->chip_class, ctx->family,
					   ctx->screen->info.drm_minor);
	else
		cayman_init_common_regs(cb, ctx->chip_class, ctx->family,
					ctx->screen->info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA
						| S_0286E8_TGID_ENA
						| S_0286E8_DISABLE_INDEX_PACK);

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently
	 * use this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware still uses this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

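/* Hook the compute entry points into the pipe_context vtable.  Vertex buffer
 * slots 0 (kernel parameters) and 1 (global memory pool) are marked enabled
 * and dirty up front, since every dispatch uses them.
 */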
void evergreen_init_compute_state_functions(struct r600_context *ctx)
{
	ctx->context.create_compute_state = evergreen_create_compute_state;
	ctx->context.delete_compute_state = evergreen_delete_compute_state;
	ctx->context.bind_compute_state = evergreen_bind_compute_state;
//	ctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	ctx->context.set_compute_resources = evergreen_set_compute_resources;
	ctx->context.set_compute_sampler_views = evergreen_set_cs_sampler_view;
	ctx->context.bind_compute_sampler_states = evergreen_bind_compute_sampler_states;
	ctx->context.set_global_binding = evergreen_set_global_binding;
	ctx->context.launch_grid = evergreen_launch_grid;

	/* We always use at least two vertex buffers for compute, one for
	 * parameters and one for global memory */
	ctx->cs_vertex_buffer_state.enabled_mask =
	ctx->cs_vertex_buffer_state.dirty_mask = 1 | 2;
}


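/* Global (PIPE_BIND_GLOBAL) buffers are not separate GPU allocations; they
 * are sub-allocated as chunks from the screen-wide compute_memory_pool, and
 * the chunk's start offset is what gets handed back to the state tracker in
 * evergreen_set_global_binding().
 */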
struct pipe_resource *r600_compute_global_buffer_create(
	struct pipe_screen *screen,
	const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.vtbl = &r600_global_buffer_vtbl;
	result->base.b.b.screen = screen;
	result->base.b.b = *templ;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}

void r600_compute_global_buffer_destroy(
	struct pipe_screen *screen,
	struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

void *r600_compute_global_transfer_map(
	struct pipe_context *ctx_,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx_;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;
	uint32_t* map;

	compute_memory_finalize_pending(pool, ctx_);

	assert(resource->target == PIPE_BUFFER);

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);

	transfer->resource = resource;
	transfer->level = level;
	transfer->usage = usage;
	transfer->box = *box;
	transfer->stride = 0;
	transfer->layer_stride = 0;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);
	assert(transfer->box.x >= 0);
	assert(transfer->box.y == 0);
	assert(transfer->box.z == 0);

	///TODO: do it better, mapping is not possible if the pool is too big

	if (!(map = r600_buffer_mmap_sync_with_rings(rctx, buffer->chunk->pool->bo, transfer->usage))) {
		util_slab_free(&rctx->pool_transfers, transfer);
		return NULL;
	}

	*ptransfer = transfer;

	COMPUTE_DBG(rctx->screen, "Buffer: %p + %u (buffer offset in global memory) "
		"+ %u (box.x)\n", map, buffer->chunk->start_in_dw, transfer->box.x);
	return ((char*)(map + buffer->chunk->start_in_dw)) + transfer->box.x;
}

void r600_compute_global_transfer_unmap(
	struct pipe_context *ctx_,
	struct pipe_transfer* transfer)
{
	struct r600_context *ctx = NULL;
	struct r600_resource_global* buffer = NULL;

	assert(transfer->resource->target == PIPE_BUFFER);
	assert(transfer->resource->bind & PIPE_BIND_GLOBAL);

	ctx = (struct r600_context *)ctx_;
	buffer = (struct r600_resource_global*)transfer->resource;

	COMPUTE_DBG(ctx->screen, "* r600_compute_global_transfer_unmap()\n");

	ctx->ws->buffer_unmap(buffer->chunk->pool->bo->cs_buf);
	util_slab_free(&ctx->pool_transfers, transfer);
}

void r600_compute_global_transfer_flush_region(
	struct pipe_context *ctx_,
	struct pipe_transfer *transfer,
	const struct pipe_box *box)
{
	assert(0 && "TODO");
}

void r600_compute_global_transfer_inline_write(
	struct pipe_context *pipe,
	struct pipe_resource *resource,
	unsigned level,
	unsigned usage,
	const struct pipe_box *box,
	const void *data,
	unsigned stride,
	unsigned layer_stride)
{
	assert(0 && "TODO");
}