si_shader.h revision 4636d9be4a40138d0a10cadcb1b63eea89d95e34
1/*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 *	Tom Stellard <thomas.stellard@amd.com>
25 *	Michel Dänzer <michel.daenzer@amd.com>
26 *      Christian König <christian.koenig@amd.com>
27 */
28
29/* How linking shader inputs and outputs between vertex, tessellation, and
30 * geometry shaders works.
31 *
32 * Inputs and outputs between shaders are stored in a buffer. This buffer
33 * lives in LDS (typical case for tessellation), but it can also live
34 * in memory (ESGS). Each input or output has a fixed location within a vertex.
35 * The highest used input or output determines the stride between vertices.
36 *
37 * Since GS and tessellation are only possible in the OpenGL core profile,
38 * only these semantics are valid for per-vertex data:
39 *
40 *   Name             Location
41 *
42 *   POSITION         0
43 *   PSIZE            1
44 *   CLIPDIST0..1     2..3
45 *   CULLDIST0..1     (not implemented)
46 *   GENERIC0..31     4..35
47 *
48 * For example, a shader only writing GENERIC0 has the output stride of 5.
49 *
50 * Only these semantics are valid for per-patch data:
51 *
52 *   Name             Location
53 *
54 *   TESSOUTER        0
55 *   TESSINNER        1
56 *   PATCH0..29       2..31
57 *
58 * That's how independent shaders agree on input and output locations.
59 * The si_shader_io_get_unique_index function assigns the locations.
60 *
61 * For tessellation, other required information for calculating the input and
62 * output addresses like the vertex stride, the patch stride, and the offsets
63 * where per-vertex and per-patch data start, is passed to the shader via
64 * user data SGPRs. The offsets and strides are calculated at draw time and
65 * aren't available at compile time.
66 */
67
68#ifndef SI_SHADER_H
69#define SI_SHADER_H
70
71#include <llvm-c/Core.h> /* LLVMModuleRef */
72#include "tgsi/tgsi_scan.h"
73#include "si_state.h"
74
75struct radeon_shader_binary;
76struct radeon_shader_reloc;
77
/* User-data SGPR indices.
 *
 * The first three entries carry no stage restriction. Entries from index 8
 * upwards are stage-specific, so the same index is deliberately reused by
 * different stages (see the per-line stage notes).
 */
#define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
#define SI_SGPR_CONST_BUFFERS	2
#define SI_SGPR_SAMPLERS	4  /* images & sampler states interleaved */
/* TODO: gap */
#define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
#define SI_SGPR_BASE_VERTEX	10 /* VS only */
#define SI_SGPR_START_INSTANCE	11 /* VS only */
#define SI_SGPR_VS_STATE_BITS	12 /* VS(VS) only */
#define SI_SGPR_LS_OUT_LAYOUT	12 /* VS(LS) only */
#define SI_SGPR_TCS_OUT_OFFSETS	8  /* TCS & TES only */
#define SI_SGPR_TCS_OUT_LAYOUT	9  /* TCS & TES only */
#define SI_SGPR_TCS_IN_LAYOUT	10 /* TCS only */
#define SI_SGPR_ALPHA_REF	8  /* PS only */

/* Number of user-data SGPRs per hardware shader stage. */
#define SI_VS_NUM_USER_SGPR	13 /* API VS */
#define SI_ES_NUM_USER_SGPR	12 /* API VS */
#define SI_LS_NUM_USER_SGPR	13 /* API VS */
#define SI_TCS_NUM_USER_SGPR	11
#define SI_TES_NUM_USER_SGPR	10
#define SI_GS_NUM_USER_SGPR	8
#define SI_GSCOPY_NUM_USER_SGPR	4
#define SI_PS_NUM_USER_SGPR	9
100
/* LLVM function parameter indices.
 *
 * Indices 0..3 are shared by every stage; from index 4 on, each stage has
 * its own numbering, so values repeat between the per-stage groups below.
 */
#define SI_PARAM_RW_BUFFERS	0
#define SI_PARAM_CONST_BUFFERS	1
#define SI_PARAM_SAMPLERS	2
#define SI_PARAM_UNUSED		3 /* TODO: use */

/* VS only parameters */
#define SI_PARAM_VERTEX_BUFFERS	4
#define SI_PARAM_BASE_VERTEX	5
#define SI_PARAM_START_INSTANCE	6
/* [0] = clamp vertex color */
#define SI_PARAM_VS_STATE_BITS	7
/* the other VS parameters are assigned dynamically */

/* Offsets where TCS outputs and TCS patch outputs live in LDS:
 *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
 *   [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32 + 32*32
 */
#define SI_PARAM_TCS_OUT_OFFSETS 4 /* for TCS & TES */

/* Layout of TCS outputs / TES inputs:
 *   [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4
 *   [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4
 *   [26:31] = gl_PatchVerticesIn, max = 32
 *
 * NOTE(review): "num_inputs" in [13:20] presumably means the number of TCS
 * outputs (i.e. TES inputs) - confirm.
 */
#define SI_PARAM_TCS_OUT_LAYOUT	5 /* for TCS & TES */

/* Layout of LS outputs / TCS inputs
 *   [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4
 *   [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4
 */
#define SI_PARAM_TCS_IN_LAYOUT	6 /* TCS only */
#define SI_PARAM_LS_OUT_LAYOUT	7 /* same value as TCS_IN_LAYOUT (the parameter index differs), LS only */

/* TCS only parameters. */
#define SI_PARAM_TESS_FACTOR_OFFSET 7
#define SI_PARAM_PATCH_ID	8
#define SI_PARAM_REL_IDS	9

/* GS only parameters */
#define SI_PARAM_GS2VS_OFFSET	4
#define SI_PARAM_GS_WAVE_ID	5
#define SI_PARAM_VTX0_OFFSET	6
#define SI_PARAM_VTX1_OFFSET	7
#define SI_PARAM_PRIMITIVE_ID	8
#define SI_PARAM_VTX2_OFFSET	9
#define SI_PARAM_VTX3_OFFSET	10
#define SI_PARAM_VTX4_OFFSET	11
#define SI_PARAM_VTX5_OFFSET	12
#define SI_PARAM_GS_INSTANCE_ID	13

/* PS only parameters */
#define SI_PARAM_ALPHA_REF		4
#define SI_PARAM_PRIM_MASK		5
#define SI_PARAM_PERSP_SAMPLE		6
#define SI_PARAM_PERSP_CENTER		7
#define SI_PARAM_PERSP_CENTROID		8
#define SI_PARAM_PERSP_PULL_MODEL	9
#define SI_PARAM_LINEAR_SAMPLE		10
#define SI_PARAM_LINEAR_CENTER		11
#define SI_PARAM_LINEAR_CENTROID	12
#define SI_PARAM_LINE_STIPPLE_TEX	13
#define SI_PARAM_POS_X_FLOAT		14
#define SI_PARAM_POS_Y_FLOAT		15
#define SI_PARAM_POS_Z_FLOAT		16
#define SI_PARAM_POS_W_FLOAT		17
#define SI_PARAM_FRONT_FACE		18
#define SI_PARAM_ANCILLARY		19
#define SI_PARAM_SAMPLE_COVERAGE	20
#define SI_PARAM_POS_FIXED_PT		21

/* Upper bound on the parameter count: +1 turns the highest index above into
 * a count, +8 for COLOR[0..1].
 */
#define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 9)
173
174struct si_shader;
175
176/* A shader selector is a gallium CSO and contains shader variants and
177 * binaries for one TGSI program. This can be shared by multiple contexts.
178 */
179struct si_shader_selector {
180	pipe_mutex		mutex;
181	struct si_shader	*first_variant; /* immutable after the first variant */
182	struct si_shader	*last_variant; /* mutable */
183
184	struct tgsi_token       *tokens;
185	struct pipe_stream_output_info  so;
186	struct tgsi_shader_info		info;
187
188	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
189	unsigned	type;
190
191	/* GS parameters. */
192	unsigned	esgs_itemsize;
193	unsigned	gs_input_verts_per_prim;
194	unsigned	gs_output_prim;
195	unsigned	gs_max_out_vertices;
196	unsigned	gs_num_invocations;
197	unsigned	max_gs_stream; /* count - 1 */
198	unsigned	gsvs_vertex_size;
199	unsigned	max_gsvs_emit_size;
200
201	/* PS parameters. */
202	unsigned	color_attr_index[2];
203	unsigned	db_shader_control;
204	/* Set 0xf or 0x0 (4 bits) per each written output.
205	 * ANDed with spi_shader_col_format.
206	 */
207	unsigned	colors_written_4bit;
208
209	/* masks of "get_unique_index" bits */
210	uint64_t	outputs_written;
211	uint32_t	patch_outputs_written;
212};
213
214/* Valid shader configurations:
215 *
216 * API shaders       VS | TCS | TES | GS |pass| PS
217 * are compiled as:     |     |     |    |thru|
218 *                      |     |     |    |    |
219 * Only VS & PS:     VS | --  | --  | -- | -- | PS
220 * With GS:          ES | --  | --  | GS | VS | PS
221 * With Tessel.:     LS | HS  | VS  | -- | -- | PS
222 * With both:        LS | HS  | ES  | GS | VS | PS
223 */
224
225/* Common VS bits between the shader key and the prolog key. */
226struct si_vs_prolog_bits {
227	unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
228};
229
/* Common VS bits between the shader key and the epilog key. */
struct si_vs_epilog_bits {
	unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
	/* TODO:
	 * - skip clipdist, culldist (including clipvertex code) exports based
	 *   on which clip_plane_enable bits are set
	 * - skip layer, viewport, clipdist, and culldist parameter exports
	 *   if PS doesn't read them
	 */
};
240
/* Common TCS bits between the shader key and the epilog key. */
struct si_tcs_epilog_bits {
	unsigned	prim_mode:3; /* 3-bit field, values 0..7 */
};
245
/* Common PS bits between the shader key and the prolog key. */
struct si_ps_prolog_bits {
	unsigned	color_two_side:1;
	/* TODO: add a flatshade bit that skips interpolation for colors */
	unsigned	poly_stipple:1;
	unsigned	force_persample_interp:1;
	/* TODO:
	 * - add force_center_interp if MSAA is disabled and centroid or
	 *   sample are present
	 * - add force_center_interp_bc_optimize to force center interpolation
	 *   based on the bc_optimize SGPR bit if MSAA is enabled, centroid is
	 *   present and sample isn't present.
	 */
};
260
/* Common PS bits between the shader key and the epilog key. */
struct si_ps_epilog_bits {
	unsigned	spi_shader_col_format;
	unsigned	color_is_int8:8;
	unsigned	last_cbuf:3;
	unsigned	alpha_func:3;
	unsigned	alpha_to_one:1;
	unsigned	poly_line_smoothing:1;
	unsigned	clamp_color:1;
};
271
272union si_shader_part_key {
273	struct {
274		struct si_vs_prolog_bits states;
275		unsigned	num_input_sgprs:5;
276		unsigned	last_input:4;
277	} vs_prolog;
278	struct {
279		struct si_vs_epilog_bits states;
280		unsigned	prim_id_param_offset:5;
281	} vs_epilog;
282	struct {
283		struct si_tcs_epilog_bits states;
284	} tcs_epilog;
285	struct {
286		struct si_ps_prolog_bits states;
287		unsigned	num_input_sgprs:5;
288		unsigned	num_input_vgprs:5;
289		/* Color interpolation and two-side color selection. */
290		unsigned	colors_read:8; /* color input components read */
291		unsigned	num_interp_inputs:5; /* BCOLOR is at this location */
292		unsigned	face_vgpr_index:5;
293		char		color_attr_index[2];
294		char		color_interp_vgpr_index[2]; /* -1 == constant */
295	} ps_prolog;
296	struct {
297		struct si_ps_epilog_bits states;
298		unsigned	colors_written:8;
299		unsigned	writes_z:1;
300		unsigned	writes_stencil:1;
301		unsigned	writes_samplemask:1;
302	} ps_epilog;
303};
304
305union si_shader_key {
306	struct {
307		struct si_ps_prolog_bits prolog;
308		struct si_ps_epilog_bits epilog;
309	} ps;
310	struct {
311		struct si_vs_prolog_bits prolog;
312		struct si_vs_epilog_bits epilog;
313		unsigned	as_es:1; /* export shader */
314		unsigned	as_ls:1; /* local shader */
315	} vs;
316	struct {
317		struct si_tcs_epilog_bits epilog;
318	} tcs; /* tessellation control shader */
319	struct {
320		struct si_vs_epilog_bits epilog; /* same as VS */
321		unsigned	as_es:1; /* export shader */
322	} tes; /* tessellation evaluation shader */
323};
324
/* Configuration values stored alongside a compiled shader binary
 * (see si_shader_binary_read_config(), which fills one of these from a
 * radeon_shader_binary).
 */
struct si_shader_config {
	unsigned			num_sgprs;
	unsigned			num_vgprs;
	unsigned			lds_size;
	unsigned			spi_ps_input_ena;
	unsigned			spi_ps_input_addr;
	unsigned			float_mode;
	unsigned			scratch_bytes_per_wave;
	unsigned			rsrc1;
	unsigned			rsrc2;
};
336
337struct si_shader {
338	struct si_shader_selector	*selector;
339	struct si_shader		*next_variant;
340
341	struct si_shader_part		*prolog;
342	struct si_shader_part		*epilog;
343
344	struct si_shader		*gs_copy_shader;
345	struct si_pm4_state		*pm4;
346	struct r600_resource		*bo;
347	struct r600_resource		*scratch_bo;
348	union si_shader_key		key;
349	struct radeon_shader_binary	binary;
350	struct si_shader_config		config;
351
352	ubyte			num_input_sgprs;
353	ubyte			num_input_vgprs;
354	char			face_vgpr_index;
355
356	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
357	bool			uses_instanceid;
358	unsigned		nr_pos_exports;
359	unsigned		nr_param_exports;
360};
361
362struct si_shader_part {
363	struct si_shader_part *next;
364	union si_shader_part_key key;
365	struct radeon_shader_binary binary;
366	struct si_shader_config config;
367};
368
369static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
370{
371	if (sctx->gs_shader.cso)
372		return &sctx->gs_shader.cso->info;
373	else if (sctx->tes_shader.cso)
374		return &sctx->tes_shader.cso->info;
375	else if (sctx->vs_shader.cso)
376		return &sctx->vs_shader.cso->info;
377	else
378		return NULL;
379}
380
381static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
382{
383	if (sctx->gs_shader.current)
384		return sctx->gs_shader.current->gs_copy_shader;
385	else if (sctx->tes_shader.current)
386		return sctx->tes_shader.current;
387	else
388		return sctx->vs_shader.current;
389}
390
391static inline bool si_vs_exports_prim_id(struct si_shader *shader)
392{
393	if (shader->selector->type == PIPE_SHADER_VERTEX)
394		return shader->key.vs.epilog.export_prim_id;
395	else if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
396		return shader->key.tes.epilog.export_prim_id;
397	else
398		return false;
399}
400
401/* si_shader.c */
402int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
403		     struct si_shader *shader,
404		     struct pipe_debug_callback *debug);
405void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f);
406int si_compile_llvm(struct si_screen *sscreen,
407		    struct radeon_shader_binary *binary,
408		    struct si_shader_config *conf,
409		    LLVMTargetMachineRef tm,
410		    LLVMModuleRef mod,
411		    struct pipe_debug_callback *debug,
412		    unsigned processor,
413		    const char *name);
414void si_shader_destroy(struct si_shader *shader);
415unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
416int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
417void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
418		    struct pipe_debug_callback *debug, unsigned processor);
419void si_shader_apply_scratch_relocs(struct si_context *sctx,
420			struct si_shader *shader,
421			uint64_t scratch_va);
422void si_shader_binary_read_config(struct radeon_shader_binary *binary,
423				  struct si_shader_config *conf,
424				  unsigned symbol_offset);
425
426#endif
427