/* r600_shader.c — revision 92fbf856f42b22f68f62c2516e0c6453c454cf05 */
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "sb/sb_public.h"
31
32#include "pipe/p_shader_tokens.h"
33#include "tgsi/tgsi_info.h"
34#include "tgsi/tgsi_parse.h"
35#include "tgsi/tgsi_scan.h"
36#include "tgsi/tgsi_dump.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include <stdio.h>
40#include <errno.h>
41
42/* CAYMAN notes
43Why CAYMAN got loops for lots of instructions is explained here.
44
45-These 8xx t-slot only ops are implemented in all vector slots.
46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47These 8xx t-slot only opcodes become vector ops, with all four
48slots expecting the arguments on sources a and b. Result is
49broadcast to all channels.
50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51These 8xx t-slot only opcodes become vector ops in the z, y, and
52x slots.
53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55SQRT_IEEE/_64
56SIN/COS
57The w slot may have an independent co-issued operation, or if the
58result is required to be in the w slot, the opcode above may be
59issued in the w slot as well.
60The compiler must issue the source argument to slots z, y, and x
61*/
62
63#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
64static int r600_shader_from_tgsi(struct r600_context *rctx,
65				 struct r600_pipe_shader *pipeshader,
66				 union r600_shader_key key);
67
68
69static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
70                           int size, unsigned comp_mask) {
71
72	if (!size)
73		return;
74
75	if (ps->num_arrays == ps->max_arrays) {
76		ps->max_arrays += 64;
77		ps->arrays = realloc(ps->arrays, ps->max_arrays *
78		                     sizeof(struct r600_shader_array));
79	}
80
81	int n = ps->num_arrays;
82	++ps->num_arrays;
83
84	ps->arrays[n].comp_mask = comp_mask;
85	ps->arrays[n].gpr_start = start_gpr;
86	ps->arrays[n].gpr_count = size;
87}
88
/* Debug helper: print the stream-output (transform feedback) map to
 * stderr, one line per output, e.g.
 *   "0: MEM_STREAM0_BUF1[4..7] <- OUT[2].xyzw".
 * Outputs whose dst_offset is below their start_component will be
 * lowered later, and are flagged as such. */
static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		/* component mask of the output, shifted to its start component */
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
		        mask & 2 ? "y" : "",
		        mask & 4 ? "z" : "",
		        mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}
110
111static int store_shader(struct pipe_context *ctx,
112			struct r600_pipe_shader *shader)
113{
114	struct r600_context *rctx = (struct r600_context *)ctx;
115	uint32_t *ptr, i;
116
117	if (shader->bo == NULL) {
118		shader->bo = (struct r600_resource*)
119			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
120		if (shader->bo == NULL) {
121			return -ENOMEM;
122		}
123		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
124		if (R600_BIG_ENDIAN) {
125			for (i = 0; i < shader->shader.bc.ndw; ++i) {
126				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
127			}
128		} else {
129			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
130		}
131		rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
132	}
133
134	return 0;
135}
136
/* Compile a TGSI shader into r600 bytecode, optionally run the sb
 * optimizer/disassembler, upload the result to a GPU buffer, and build
 * the chip-specific state for the shader stage.
 *
 * Returns 0 on success.  On any failure the partially built shader is
 * destroyed (goto error) and a negative errno is returned.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* sb = the optimizing backend; can be disabled globally via debug flag */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == TGSI_PROCESSOR_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	/* sb is not used for the tessellation stages either */
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_EVAL);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built.  When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Either dump the raw bytecode, or hand it to sb for optimization
	 * and/or disassembly. */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	/* A GS has a companion copy shader that routes ring data to the
	 * fixed-function pipe; it must be processed and uploaded too. */
	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_TESS_EVAL:
		/* TES feeds either the GS (as_es) or the fixed-function VS slot */
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			/* VS may run as LS (tess), ES (feeds GS) or plain VS */
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
270
/* Free everything a compiled shader owns: the GPU buffer holding the
 * uploaded bytecode, the CPU-side bytecode, and the baked command buffer.
 * Safe to call on a partially built shader (used on the error path of
 * r600_pipe_shader_create). */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
277
278/*
279 * tgsi -> r600 shader
280 */
281struct r600_shader_tgsi_instruction;
282
/* A TGSI source operand lowered to r600 terms: register/constant select,
 * per-channel swizzle, and the modifiers needed to fill ALU src fields. */
struct r600_shader_src {
	unsigned				sel;        /* register / special-value selector */
	unsigned				swizzle[4]; /* source swizzle, one entry per channel */
	unsigned				neg;        /* negate modifier */
	unsigned				abs;        /* absolute-value modifier */
	unsigned				rel;        /* relative (indexed) addressing */
	unsigned				kc_bank;    /* constant-cache bank for constant files */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];   /* literal values when sel is a literal */
};
293
/* Tracks one evergreen interpolator slot (see eg_get_interpolator_index). */
struct eg_interp {
	boolean					enabled;  /* some input needs this interpolator */
	unsigned				ij_index; /* assigned ij coefficient pair index */
};
298
/* All state carried while translating one TGSI shader to r600 bytecode. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;    /* scanned TGSI metadata */
	struct tgsi_parse_context		parse;   /* token stream cursor */
	const struct tgsi_token			*tokens; /* input TGSI program */
	unsigned				type;    /* TGSI_PROCESSOR_* of this shader */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* first GPR of each register file */
	unsigned				temp_reg; /* base GPR for driver scratch temps */
	const struct r600_shader_tgsi_instruction	*inst_info; /* emit-table entry of current opcode */
	struct r600_bytecode			*bc;      /* bytecode being emitted */
	struct r600_shader			*shader;  /* resulting shader description */
	struct r600_shader_src			src[4];   /* decoded sources of current instruction */
	uint32_t				*literals; /* literal pool of current instruction */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* scratch GPRs handed out (see r600_get_temp) */
	boolean use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;      /* count of COLOR fragment inputs declared */
	boolean                 clip_vertex_write;        /* shader writes CLIPVERTEX */
	unsigned                cv_output;                /* output index of CLIPVERTEX */
	unsigned		edgeflag_output;          /* output index of EDGEFLAG */
	int					fragcoord_input;  /* input index of POSITION (fragment) */
	int					native_integers;
	int					next_ring_offset; /* running GS input ring offset */
	int					gs_out_ring_offset; /* running GS output ring offset */
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess output offsets */
};
336
/* One entry of the per-chip TGSI opcode table: the r600 opcode to use and
 * the callback that emits bytecode for the instruction. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
341
342static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
343static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
344static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
345static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
346static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
347static int tgsi_else(struct r600_shader_ctx *ctx);
348static int tgsi_endif(struct r600_shader_ctx *ctx);
349static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
350static int tgsi_endloop(struct r600_shader_ctx *ctx);
351static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
352static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
353                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
354                                unsigned int dst_reg);
355static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
356			const struct r600_shader_src *shader_src,
357			unsigned chan);
358
/* Return the highest channel index (0-3) present in writemask, or 0 when
 * the mask is empty — i.e. the slot that should carry the ALU "last" bit. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
370
/* Reject TGSI instructions this backend cannot translate: multiple
 * destinations (except DFRACEXP), predication, and dimensioned (2D)
 * registers outside the stage/file combinations that support them.
 * Returns 0 if supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
		   switch (i->Src[j].Register.File) {
		   case TGSI_FILE_CONSTANT:
			   break;
		   case TGSI_FILE_INPUT:
			   if (ctx->type == TGSI_PROCESSOR_GEOMETRY ||
			       ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
			       ctx->type == TGSI_PROCESSOR_TESS_EVAL)
				   break;
			   /* fallthrough: 2D inputs in other stages are errors */
		   case TGSI_FILE_OUTPUT:
			   if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
				   break;
			   /* fallthrough: 2D outputs only valid in TCS */
		   default:
			   R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
				    i->Src[j].Register.File,
				    i->Src[j].Register.Dimension);
			   return -EINVAL;
		   }
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			/* only TCS may write dimensioned (per-vertex) outputs */
			if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
421
422int eg_get_interpolator_index(unsigned interpolate, unsigned location)
423{
424	if (interpolate == TGSI_INTERPOLATE_COLOR ||
425		interpolate == TGSI_INTERPOLATE_LINEAR ||
426		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
427	{
428		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
429		int loc;
430
431		switch(location) {
432		case TGSI_INTERPOLATE_LOC_CENTER:
433			loc = 1;
434			break;
435		case TGSI_INTERPOLATE_LOC_CENTROID:
436			loc = 2;
437			break;
438		case TGSI_INTERPOLATE_LOC_SAMPLE:
439		default:
440			loc = 0; break;
441		}
442
443		return is_linear * 3 + loc;
444	}
445
446	return -1;
447}
448
/* Copy the ij coefficient-pair index of the interpolator matching this
 * input's mode/location onto the input itself.  The eg_interpolators[]
 * ij_index values are assigned elsewhere during setup — assumed valid by
 * the time inputs are declared. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
458
/* Emit the two ALU groups (INTERP_ZW then INTERP_XY) that interpolate one
 * fragment input from its ij barycentrics and the parameter at lds_pos.
 * Eight slots are emitted; only those whose channel carries a live result
 * (chans 2,3 of the ZW group, chans 0,1 of the XY group) write the GPR. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	/* two ij pairs live per GPR: pair 0 in chans 1/0, pair 1 in chans 3/2 */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		/* slots 0-3 form the ZW group, slots 4-7 the XY group */
		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the two barycentric channels of the pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		/* interp instructions require this fixed bank swizzle */
		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
499
500static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
501{
502	int i, r;
503	struct r600_bytecode_alu alu;
504
505	for (i = 0; i < 4; i++) {
506		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
507
508		alu.op = ALU_OP1_INTERP_LOAD_P0;
509
510		alu.dst.sel = ctx->shader->input[input].gpr;
511		alu.dst.write = 1;
512
513		alu.dst.chan = i;
514
515		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
516		alu.src[0].chan = i;
517
518		if (i == 3)
519			alu.last = 1;
520		r = r600_bytecode_add_alu(ctx->bc, &alu);
521		if (r)
522			return r;
523	}
524	return 0;
525}
526
527/*
528 * Special export handling in shaders
529 *
530 * shader export ARRAY_BASE for EXPORT_POS:
531 * 60 is position
532 * 61 is misc vector
533 * 62, 63 are clip distance vectors
534 *
535 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
536 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
537 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
538 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
539 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
540 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
541 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
542 * exclusive from render target index)
543 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
544 *
545 *
546 * shader export ARRAY_BASE for EXPORT_PIXEL:
547 * 0-7 CB targets
548 * 61 computed Z vector
549 *
550 * The use of the values exported in the computed Z vector are controlled
551 * by DB_SHADER_CONTROL:
552 * Z_EXPORT_ENABLE - Z as a float in RED
553 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
554 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
555 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
556 * DB_SOURCE_FORMAT - export control restrictions
557 *
558 */
559
560
561/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
562static int r600_spi_sid(struct r600_shader_io * io)
563{
564	int index, name = io->name;
565
566	/* These params are handled differently, they don't need
567	 * semantic indices, so we'll use 0 for them.
568	 */
569	if (name == TGSI_SEMANTIC_POSITION ||
570	    name == TGSI_SEMANTIC_PSIZE ||
571	    name == TGSI_SEMANTIC_EDGEFLAG ||
572	    name == TGSI_SEMANTIC_FACE ||
573	    name == TGSI_SEMANTIC_SAMPLEMASK)
574		index = 0;
575	else {
576		if (name == TGSI_SEMANTIC_GENERIC) {
577			/* For generic params simply use sid from tgsi */
578			index = io->sid;
579		} else {
580			/* For non-generic params - pack name and sid into 8 bits */
581			index = 0x80 | (name<<3) | (io->sid);
582		}
583
584		/* Make sure that all really used indices have nonzero value, so
585		 * we can just compare it to 0 later instead of comparing the name
586		 * with different values to detect special cases. */
587		index++;
588	}
589
590	return index;
591};
592
/* we need this to get a common lds index for vs/tcs/tes input/outputs */
int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		/* NOTE(review): generics map to slots 4+; the "- 9" presumably
		 * compensates for the first generic sid starting at 9 — confirm
		 * against how sids are assigned for these stages. */
		if (index <= 63-4)
			return 4 + index - 9;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}
630
631/* turn input into interpolate on EG */
632static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
633{
634	int r = 0;
635
636	if (ctx->shader->input[index].spi_sid) {
637		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
638		if (ctx->shader->input[index].interpolate > 0) {
639			evergreen_interp_assign_ij_index(ctx, index);
640			if (!ctx->use_llvm)
641				r = evergreen_interp_alu(ctx, index);
642		} else {
643			if (!ctx->use_llvm)
644				r = evergreen_interp_flat(ctx, index);
645		}
646	}
647	return r;
648}
649
650static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
651{
652	struct r600_bytecode_alu alu;
653	int i, r;
654	int gpr_front = ctx->shader->input[front].gpr;
655	int gpr_back = ctx->shader->input[back].gpr;
656
657	for (i = 0; i < 4; i++) {
658		memset(&alu, 0, sizeof(alu));
659		alu.op = ALU_OP3_CNDGT;
660		alu.is_op3 = 1;
661		alu.dst.write = 1;
662		alu.dst.sel = gpr_front;
663		alu.src[0].sel = ctx->face_gpr;
664		alu.src[1].sel = gpr_front;
665		alu.src[2].sel = gpr_back;
666
667		alu.dst.chan = i;
668		alu.src[1].chan = i;
669		alu.src[2].chan = i;
670		alu.last = (i==3);
671
672		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
673			return r;
674	}
675
676	return 0;
677}
678
/* execute a single slot ALU calculation */
/* Emit one OP2 ALU instruction writing dst_sel.dst_chan.
 * Each src*_chan_val is a channel number, or the 32-bit literal value
 * itself when the matching sel is V_SQ_ALU_SRC_LITERAL.
 * On cayman, MULLO_INT is a t-slot-only op implemented across all four
 * vector slots (see the CAYMAN notes at the top of the file), so the
 * instruction is replicated and only the slot matching dst_chan writes. */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan; /* only the requested channel lands */
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* common path: a single slot with an explicit write */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
734
/* execute a single slot ALU calculation */
/* OP3 variant of single_alu_op2; literal handling works the same way.
 * Only validated for MULADD_UINT24 so far (hence the assert).
 * NOTE(review): no dst.write is set here, unlike the OP2 path —
 * presumably OP3 encodings always write their destination; confirm
 * against the ISA / r600_bytecode_add_alu before relying on it. */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
773
/* put it in temp_reg.x */
/* Compute the LDS base offset of the current patch into temp_reg.x:
 *   offset = rel_patch_id * patch_stride + patch0_(data_)offset
 * in one MULADD_UINT24, reading the preloaded values in tess_output_info.
 * is_patch_var selects the per-patch data offset (.w) over the per-vertex
 * one (.z). */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}
795
796static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
797{
798	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
799}
800
801static int r600_get_temp(struct r600_shader_ctx *ctx)
802{
803	return ctx->temp_reg + ctx->max_driver_temp_used++;
804}
805
806static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
807{
808	int i;
809	i = ctx->shader->noutput++;
810	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
811	ctx->shader->output[i].sid = 0;
812	ctx->shader->output[i].gpr = 0;
813	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
814	ctx->shader->output[i].write_mask = 0x4;
815	ctx->shader->output[i].spi_sid = prim_id_sid;
816
817	return 0;
818}
819
/* Handle one TGSI declaration token: record inputs/outputs (with their
 * GPRs, semantic and SPI indices, and per-stage side effects), register
 * indirectly-addressed temp arrays, and validate system values.
 * Returns 0 on success or -EINVAL for unsupported files. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < Elements(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					/* counted for two-sided color handling */
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				/* on EG+ fragment inputs are fetched via explicit interp ALU */
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < Elements(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
			    ctx->type == TGSI_PROCESSOR_GEOMETRY ||
			    ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* track outputs that feed PA_CL_VS_OUT_CNTL (see the
				 * "Special export handling" comment above) */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
									((d->Semantic.Index + j) << 2);
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* only indirectly addressed temp arrays need to be recorded */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
								   d->Range.First,
				               d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			/* non-native-integer shaders expect a float instance id,
			 * so convert GPR0.w in place */
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fallthrough: any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
973
/* Scan the shader for uses of the SAMPLEMASK and SAMPLEID/SAMPLEPOS system
 * values (including the implicit SAMPLEID requirement of INTERP_SAMPLE) and
 * reserve one input GPR, starting at gpr_offset, for each one that is used.
 * As a side effect, interpolators required by INTERP_* instructions are
 * flagged in ctx->eg_interpolators.
 *
 * Returns the first free GPR after the reserved ones (gpr_offset plus the
 * number of GPRs allocated here); returns 0 if token parsing fails.
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;			/* where to record the allocated GPR */
		unsigned name, alternate_name;	/* TGSI semantics mapping to this slot */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				/* flag the interpolator this INTERP_* variant needs */
				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				/* explicit system-value declarations enable the slot too */
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1053
1054/*
1055 * for evergreen we need to scan the shader to find the number of GPRs we need to
1056 * reserve for interpolation and system values
1057 *
1058 * we need to know if we are going to emit
1059 * any sample or centroid inputs
1060 * if perspective and linear are required
1061*/
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* First pass: enable interpolators required by the declared inputs. */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				/* INTERP_SAMPLE/OFFSET evaluate at center, CENTROID at centroid */
				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* two IJ pairs fit per GPR, so round up to whole GPRs */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
1133
1134/* sample_id_sel == NULL means fetch for current sample */
/* Fetch a sample position from the R600_BUFFER_INFO constant buffer into a
 * fresh temp GPR.  sample_id == NULL fetches the current sample's position
 * (the SAMPLEID held in Fixed Point Position GPR.w); otherwise the index is
 * taken from sample_id's channel chan_sel.
 *
 * Returns the temp GPR holding the fetched .xyzw position, or the (nonzero)
 * error code of a failed bytecode emission.
 */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		/* copy the requested sample index into t1.x to use as fetch index */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
1188
/* Translate a TGSI source operand into the r600 source description
 * (swizzle, negate/abs, register select).  Immediates are resolved through
 * ctx->literals, and system values are remapped onto the GPRs/channels they
 * were placed in at declaration time.
 */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* If all four channels read the same immediate component, try to
		 * replace the literal with an inline hardware constant. */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* otherwise emit the 4 dwords as a literal source */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* .xy from the fetched position; swizzle 4 presumably selects a
			 * constant component — TODO confirm against r600 swizzle encoding */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		}
	} else {
		/* plain register file: apply per-file GPR base offset */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants carry the constant-buffer index (possibly indirect) */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1268
/* Fetch a relatively-addressed constant (CONST[AR+offset]) from constant
 * buffer cb_idx into dst_reg.xyzw using a VTX fetch.  The index comes from
 * the AR register's channel ar_chan; cb_rel enables indexed constant-buffer
 * addressing.  Returns 0 on success or a bytecode-emission error code.
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* dst_reg.ar_chan = AR + offset; dst_reg is safe scratch because the
		 * fetch below overwrites it anyway. */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1324
1325static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1326{
1327	struct r600_bytecode_vtx vtx;
1328	int r;
1329	unsigned index = src->Register.Index;
1330	unsigned vtx_id = src->Dimension.Index;
1331	int offset_reg = vtx_id / 3;
1332	int offset_chan = vtx_id % 3;
1333
1334	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1335	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1336
1337	if (offset_reg == 0 && offset_chan == 2)
1338		offset_chan = 3;
1339
1340	if (src->Dimension.Indirect) {
1341		int treg[3];
1342		int t2;
1343		struct r600_bytecode_alu alu;
1344		int r, i;
1345
1346		/* you have got to be shitting me -
1347		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1348		   at least this is what fglrx seems to do. */
1349		for (i = 0; i < 3; i++) {
1350			treg[i] = r600_get_temp(ctx);
1351		}
1352		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1353
1354		t2 = r600_get_temp(ctx);
1355		for (i = 0; i < 3; i++) {
1356			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1357			alu.op = ALU_OP1_MOV;
1358			alu.src[0].sel = 0;
1359			alu.src[0].chan = i == 2 ? 3 : i;
1360			alu.dst.sel = treg[i];
1361			alu.dst.chan = 0;
1362			alu.dst.write = 1;
1363			alu.last = 1;
1364			r = r600_bytecode_add_alu(ctx->bc, &alu);
1365			if (r)
1366				return r;
1367		}
1368		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1369		alu.op = ALU_OP1_MOV;
1370		alu.src[0].sel = treg[0];
1371		alu.src[0].rel = 1;
1372		alu.dst.sel = t2;
1373		alu.dst.write = 1;
1374		alu.last = 1;
1375		r = r600_bytecode_add_alu(ctx->bc, &alu);
1376		if (r)
1377			return r;
1378		offset_reg = t2;
1379	}
1380
1381
1382	memset(&vtx, 0, sizeof(vtx));
1383	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1384	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1385	vtx.src_gpr = offset_reg;
1386	vtx.src_sel_x = offset_chan;
1387	vtx.offset = index * 16; /*bytes*/
1388	vtx.mega_fetch_count = 16;
1389	vtx.dst_gpr = dst_reg;
1390	vtx.dst_sel_x = 0;		/* SEL_X */
1391	vtx.dst_sel_y = 1;		/* SEL_Y */
1392	vtx.dst_sel_z = 2;		/* SEL_Z */
1393	vtx.dst_sel_w = 3;		/* SEL_W */
1394	if (ctx->bc->chip_class >= EVERGREEN) {
1395		vtx.use_const_fields = 1;
1396	} else {
1397		vtx.data_format = FMT_32_32_32_32_FLOAT;
1398	}
1399
1400	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1401		return r;
1402
1403	return 0;
1404}
1405
1406static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1407{
1408	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1409	int i;
1410
1411	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1412		struct tgsi_full_src_register *src = &inst->Src[i];
1413
1414		if (src->Register.File == TGSI_FILE_INPUT) {
1415			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1416				/* primitive id is in R0.z */
1417				ctx->src[i].sel = 0;
1418				ctx->src[i].swizzle[0] = 2;
1419			}
1420		}
1421		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1422			int treg = r600_get_temp(ctx);
1423
1424			fetch_gs_input(ctx, src, treg);
1425			ctx->src[i].sel = treg;
1426		}
1427	}
1428	return 0;
1429}
1430
1431
1432/* Tessellation shaders pass outputs to the next shader using LDS.
1433 *
1434 * LS outputs = TCS(HS) inputs
1435 * TCS(HS) outputs = TES(DS) inputs
1436 *
1437 * The LDS layout is:
1438 * - TCS inputs for patch 0
1439 * - TCS inputs for patch 1
1440 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
1441 * - ...
1442 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
1443 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
1444 * - TCS outputs for patch 1
1445 * - Per-patch TCS outputs for patch 1
1446 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
1447 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1448 * - ...
1449 *
1450 * All three shaders VS(LS), TCS, TES share the same LDS space.
1451 */
1452/* this will return with the dw address in temp_reg.x */
/* Compute the LDS byte address of a tessellation input/output operand and
 * add it to the base address already held in temp_reg.x.
 *
 * The address is built up as:
 *   temp.x += vertex_index * stride          (for 2D, per-vertex operands)
 *   temp.x += relative_index * 16            (for indirectly-addressed regs)
 *   temp.x += lds_unique_index(sem) * 16     (per-attribute slot offset)
 *
 * dst/src: exactly one is non-NULL; the address math is identical for both.
 * stride_bytes_reg/chan: register+channel holding the per-vertex stride.
 * Returns 0 on success or a bytecode-emission error code.
 */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			/* constant vertex index becomes an inline literal */
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x = stride * vertex_index + temp.x */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* pick the semantic tables for the operand's file */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg */
		/* temp.x = 16 * addr + temp.x  (each attribute slot is 16 bytes) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
1554
1555static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
1556			       unsigned dst_reg)
1557{
1558	struct r600_bytecode_alu alu;
1559	int r, i;
1560
1561	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
1562		ctx->bc->force_add_cf = 1;
1563	for (i = 1; i < 4; i++) {
1564		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1565				   temp_reg, i,
1566				   temp_reg, 0,
1567				   V_SQ_ALU_SRC_LITERAL, 4 * i);
1568	}
1569	for (i = 0; i < 4; i++) {
1570		/* emit an LDS_READ_RET */
1571		memset(&alu, 0, sizeof(alu));
1572		alu.op = LDS_OP1_LDS_READ_RET;
1573		alu.src[0].sel = temp_reg;
1574		alu.src[0].chan = i;
1575		alu.src[1].sel = V_SQ_ALU_SRC_0;
1576		alu.src[2].sel = V_SQ_ALU_SRC_0;
1577		alu.dst.chan = 0;
1578		alu.is_lds_idx_op = true;
1579		alu.last = 1;
1580		r = r600_bytecode_add_alu(ctx->bc, &alu);
1581		if (r)
1582			return r;
1583	}
1584	for (i = 0; i < 4; i++) {
1585		/* then read from LDS_OQ_A_POP */
1586		memset(&alu, 0, sizeof(alu));
1587
1588		alu.op = ALU_OP1_MOV;
1589		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
1590		alu.src[0].chan = 0;
1591		alu.dst.sel = dst_reg;
1592		alu.dst.chan = i;
1593		alu.dst.write = 1;
1594		alu.last = 1;
1595		r = r600_bytecode_add_alu(ctx->bc, &alu);
1596		if (r)
1597			return r;
1598	}
1599	return 0;
1600}
1601
1602static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1603{
1604	int r;
1605	unsigned temp_reg = r600_get_temp(ctx);
1606
1607	r = get_lds_offset0(ctx, 2, temp_reg,
1608			    src->Register.Dimension ? false : true);
1609	if (r)
1610		return r;
1611
1612	/* the base address is now in temp.x */
1613	r = r600_get_byte_address(ctx, temp_reg,
1614				  NULL, src, ctx->tess_output_info, 1);
1615	if (r)
1616		return r;
1617
1618	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1619	if (r)
1620		return r;
1621	return 0;
1622}
1623
1624static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1625{
1626	int r;
1627	unsigned temp_reg = r600_get_temp(ctx);
1628
1629	/* t.x = ips * r0.y */
1630	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1631			   temp_reg, 0,
1632			   ctx->tess_input_info, 0,
1633			   0, 1);
1634
1635	if (r)
1636		return r;
1637
1638	/* the base address is now in temp.x */
1639	r = r600_get_byte_address(ctx, temp_reg,
1640				  NULL, src, ctx->tess_input_info, 1);
1641	if (r)
1642		return r;
1643
1644	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1645	if (r)
1646		return r;
1647	return 0;
1648}
1649
1650static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1651{
1652	int r;
1653	unsigned temp_reg = r600_get_temp(ctx);
1654
1655	r = get_lds_offset0(ctx, 1, temp_reg,
1656			    src->Register.Dimension ? false : true);
1657	if (r)
1658		return r;
1659	/* the base address is now in temp.x */
1660	r = r600_get_byte_address(ctx, temp_reg,
1661				  NULL, src,
1662				  ctx->tess_output_info, 1);
1663	if (r)
1664		return r;
1665
1666	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1667	if (r)
1668		return r;
1669	return 0;
1670}
1671
1672static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1673{
1674	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1675	int i;
1676
1677	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1678		struct tgsi_full_src_register *src = &inst->Src[i];
1679
1680		if (ctx->type == TGSI_PROCESSOR_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1681			int treg = r600_get_temp(ctx);
1682			fetch_tes_input(ctx, src, treg);
1683			ctx->src[i].sel = treg;
1684			ctx->src[i].rel = 0;
1685		}
1686		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1687			int treg = r600_get_temp(ctx);
1688			fetch_tcs_input(ctx, src, treg);
1689			ctx->src[i].sel = treg;
1690			ctx->src[i].rel = 0;
1691		}
1692		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1693			int treg = r600_get_temp(ctx);
1694			fetch_tcs_output(ctx, src, treg);
1695			ctx->src[i].sel = treg;
1696			ctx->src[i].rel = 0;
1697		}
1698	}
1699	return 0;
1700}
1701
/* Translate all sources of the current instruction (via tgsi_src) and then
 * copy constant-file sources into temp GPRs until at most one remains in
 * place — presumably because the hardware limits how many constant sources
 * one ALU instruction may read (TODO confirm).  Relatively-addressed
 * constants are always fetched into a temp via a VTX fetch.
 *
 * Returns 0 on success or a bytecode-emission error code.
 */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* count constant sources while translating all operands */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts down: once it reaches 0 the last constant may stay in place */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* indirect constant: fetch CONST[AR+index] into a temp */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy all four channels of the constant into a temp GPR */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1756
1757/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nliteral, r;

	/* count literal sources */
	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			nliteral++;
		}
	}
	/* move all but the last literal into temp GPRs (j counts down) */
	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].value = ctx->src[i].value[k];
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			j--;
		}
	}
	return 0;
}
1793
1794static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1795{
1796	int i, r, count = ctx->shader->ninput;
1797
1798	for (i = 0; i < count; i++) {
1799		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1800			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1801			if (r)
1802				return r;
1803		}
1804	}
1805	return 0;
1806}
1807
/* Emit MEM_STREAM exports for the shader's stream-output (transform
 * feedback) declarations.
 *
 * stream == -1 emits all outputs; otherwise only those matching 'stream'.
 * stream_item_size is unused in this body — TODO confirm whether callers
 * rely on it being written.  Returns 0 on success or -EINVAL / a
 * bytecode-emission error code.
 */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int i, j, r;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* shift the components down so they start at channel X */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		if (stream != -1 && stream != so->output[i].output_buffer)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->chip_class >= EVERGREEN) {
			/* Evergreen+: per-stream, per-buffer CF ops; the stream is
			 * folded in as an opcode offset of 4 per stream. */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			/* pre-Evergreen: one CF op per buffer, single stream */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
					break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
1928
1929static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1930{
1931	struct r600_bytecode_alu alu;
1932	unsigned reg;
1933
1934	if (!ctx->shader->vs_out_edgeflag)
1935		return;
1936
1937	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1938
1939	/* clamp(x, 0, 1) */
1940	memset(&alu, 0, sizeof(alu));
1941	alu.op = ALU_OP1_MOV;
1942	alu.src[0].sel = reg;
1943	alu.dst.sel = reg;
1944	alu.dst.write = 1;
1945	alu.dst.clamp = 1;
1946	alu.last = 1;
1947	r600_bytecode_add_alu(ctx->bc, &alu);
1948
1949	memset(&alu, 0, sizeof(alu));
1950	alu.op = ALU_OP1_FLT_TO_INT;
1951	alu.src[0].sel = reg;
1952	alu.dst.sel = reg;
1953	alu.dst.write = 1;
1954	alu.last = 1;
1955	r600_bytecode_add_alu(ctx->bc, &alu);
1956}
1957
/* Build the "GS copy shader": a generated vertex shader that runs after
 * a geometry shader.  The GS writes its vertices to the GSVS ring in
 * memory; this shader fetches each vertex back from the ring, performs
 * stream-out for every active stream, and then does the position /
 * parameter exports a regular VS would have done.
 *
 * Returns the result of r600_bytecode_build(), or 0 when allocating the
 * copy shader fails.  NOTE(review): allocation failure is reported as
 * success (0) — confirm callers check gs->gs_copy_shader for NULL.
 */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	/* 60 = position export slot; 61+ = misc/clip-dist slots */
	int i, j, next_clip_pos = 61, next_param = 0;
	int ring;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* the copy shader exports exactly the outputs the GS declared */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* Incoming R0.x packs the stream id in its top two bits (see the
	 * shift below and the per-ring predicate); split it apart. */
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30  (the stream id, compared per ring below) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;	/* one vec4 per output */

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;	/* index comes from R0 (masked above) */
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			/* EG+: take format from the buffer resource words */
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	/* first free GPR after the fetched vertex data */
	ctx.temp_reg = i + 1;
	/* Per-stream stream-out, predicated on the stream id in R0.y.
	 * Iterating high-to-low leaves stream 0's jump as the last one to
	 * be patched by the final POP emitted after the export code. */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				break;
			}
		}
		/* stream 0 is always emitted (it feeds the exports below) */
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		/* skip this stream's code when the predicate fails;
		 * target patched on the next iteration or after the loop */
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
		/* NOTE(review): this overwrites the size emit_streamout just
		 * stored through its last argument — confirm intentional */
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		/* CLIPVERTEX was lowered to CLIPDIST outputs already */
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only outputs belonging to stream 0 get exported */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		/* swizzle 7 masks a component; 4/5 select constants 0/1 */
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			/* point size goes in the misc vector (slot 61), .x */
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			/* layer goes in the misc vector (slot 61), .z */
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			/* viewport index goes in the misc vector (slot 61), .w */
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* the hardware requires at least one position export: emit a
	 * fully-masked dummy one if nothing was exported above */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;	/* NOTE(review): overwritten two lines below */
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* likewise at least one parameter export is required */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;	/* NOTE(review): overwritten two lines below */
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* the final export of each kind must be EXPORT_DONE */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* patch the last pending stream jump to land after this POP */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	/* chip-specific end-of-program marker */
	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* one stack entry for the predicate push/pop above */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2263
2264static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2265{
2266	if (ind) {
2267		struct r600_bytecode_alu alu;
2268		int r;
2269
2270		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2271		alu.op = ALU_OP2_ADD_INT;
2272		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2273		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2274		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2275		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2276		alu.dst.write = 1;
2277		alu.last = 1;
2278		r = r600_bytecode_add_alu(ctx->bc, &alu);
2279		if (r)
2280			return r;
2281	}
2282	return 0;
2283}
2284
/* Emit MEM_RING writes that store every output of the current vertex to
 * the GS output ring for the given stream.
 *
 * stream == -1 behaves like stream 0 for the index register while still
 * hitting the default (CF_OP_MEM_RING) switch case.
 * ind selects indirect addressing: the ring offset then comes from the
 * per-stream export GPR instead of being folded into array_base.
 * Increments ctx->gs_next_vertex before returning; always returns 0.
 *
 * NOTE(review): the 'so' parameter is unused in this function.
 */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int i, k, ring_offset;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* skip outputs the GS does not consume */
			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;	/* one vec4 per output */
			idx++;
		}

		/* position only needs to reach stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;	/* store all four components */
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* pick the MEM_RING op matching the stream */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2354
2355
2356static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2357{
2358	int r;
2359	struct r600_bytecode_vtx vtx;
2360	int temp_val = ctx->temp_reg;
2361	/* need to store the TCS output somewhere */
2362	r = single_alu_op2(ctx, ALU_OP1_MOV,
2363			   temp_val, 0,
2364			   V_SQ_ALU_SRC_LITERAL, 0,
2365			   0, 0);
2366	if (r)
2367		return r;
2368
2369	/* used by VS/TCS */
2370	if (ctx->tess_input_info) {
2371		/* fetch tcs input values into resv space */
2372		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2373		vtx.op = FETCH_OP_VFETCH;
2374		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2375		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2376		vtx.mega_fetch_count = 16;
2377		vtx.data_format = FMT_32_32_32_32;
2378		vtx.num_format_all = 2;
2379		vtx.format_comp_all = 1;
2380		vtx.use_const_fields = 0;
2381		vtx.endian = r600_endian_swap(32);
2382		vtx.srf_mode_all = 1;
2383		vtx.offset = 0;
2384		vtx.dst_gpr = ctx->tess_input_info;
2385		vtx.dst_sel_x = 0;
2386		vtx.dst_sel_y = 1;
2387		vtx.dst_sel_z = 2;
2388		vtx.dst_sel_w = 3;
2389		vtx.src_gpr = temp_val;
2390		vtx.src_sel_x = 0;
2391
2392		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2393		if (r)
2394			return r;
2395	}
2396
2397	/* used by TCS/TES */
2398	if (ctx->tess_output_info) {
2399		/* fetch tcs output values into resv space */
2400		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2401		vtx.op = FETCH_OP_VFETCH;
2402		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2403		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2404		vtx.mega_fetch_count = 16;
2405		vtx.data_format = FMT_32_32_32_32;
2406		vtx.num_format_all = 2;
2407		vtx.format_comp_all = 1;
2408		vtx.use_const_fields = 0;
2409		vtx.endian = r600_endian_swap(32);
2410		vtx.srf_mode_all = 1;
2411		vtx.offset = 16;
2412		vtx.dst_gpr = ctx->tess_output_info;
2413		vtx.dst_sel_x = 0;
2414		vtx.dst_sel_y = 1;
2415		vtx.dst_sel_z = 2;
2416		vtx.dst_sel_w = 3;
2417		vtx.src_gpr = temp_val;
2418		vtx.src_sel_x = 0;
2419
2420		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2421		if (r)
2422			return r;
2423	}
2424	return 0;
2425}
2426
/* Store all outputs of a VS running as LS to LDS for the TCS to read.
 *
 * The per-vertex LDS base is computed as vertexID * vertex_dw_stride
 * (the stride comes from tess_input_info.y fetched just above); each
 * output is then written as two LDS_WRITE_REL ops covering channels
 * xy and zw.  Returns 0 on success, negative on emission error. */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int i, j, r;
	int temp_reg;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = base + param * 16 (byte offset of this output);
		 * skipped when param == 0 so temp.x is used directly */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second dword pair (+8 bytes) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* j == 0 writes .xy at the base address, j == 1 writes .zw
		 * at base + 8; src chan picks the matching address channel */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;	/* LDS byte address */
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;	/* first dword */
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;	/* second dword */
			alu.src[2].chan = (j * 2) + 1;
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2491
/* Store the result of the current TCS instruction to LDS when its
 * destination is an OUTPUT register.
 *
 * Computes the LDS byte address for Dst[0] (per-patch addressing when
 * the destination has no dimension) and writes every channel enabled
 * in the write mask.  Aligned channel pairs (xy / zw both set) are
 * combined into one 64-bit LDS_WRITE_REL; remaining channels use a
 * 32-bit LDS_WRITE.  Returns 0 on success, negative on error. */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	/* non-OUTPUT destinations are plain GPR writes; nothing to do */
	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	/* precompute the address of each enabled channel: temp.i = temp.x + 4*i */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* both channels of an aligned pair set: one 64-bit write */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;	/* LDS byte address */
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;	/* the pair's second channel was just handled */
			continue;
		}
		/* lone channel: 32-bit write */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;	/* LDS byte address */
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2575
2576static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2577				 int output_idx)
2578{
2579	int param;
2580	unsigned temp_reg = r600_get_temp(ctx);
2581	unsigned name = ctx->shader->output[output_idx].name;
2582	int dreg = ctx->shader->output[output_idx].gpr;
2583	int r;
2584
2585	param = r600_get_lds_unique_index(name, 0);
2586	r = get_lds_offset0(ctx, 1, temp_reg, true);
2587	if (r)
2588		return r;
2589
2590	r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2591			   temp_reg, 0,
2592			   temp_reg, 0,
2593			   V_SQ_ALU_SRC_LITERAL, param * 16);
2594	if (r)
2595		return r;
2596
2597	do_lds_fetch_values(ctx, temp_reg, dreg);
2598	return 0;
2599}
2600
/* Emit the tessellation-factor writes at the end of a TCS.
 *
 * Reads the TESSOUTER/TESSINNER outputs back from LDS, builds
 * (TF-buffer index, value) pairs in temp registers, and writes each
 * pair with a GDS TF_WRITE.  The whole sequence is predicated so only
 * invocation 0 of each patch performs the writes.
 * Returns 0 on success, negative on error or malformed outputs. */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int i;
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int r;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};	/* each holds two (addr, value) pairs */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z, 0 -- src[1]'s literal value is 0 via the
	 * memset.  NOTE(review): an earlier comment said R0.x, but src chan
	 * is 2 (R0.z); the register-map comment below lists InvocationID
	 * first -- verify which channel actually carries it. */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	/* jump over the TF writes when the predicate fails; the target is
	 * patched after the POP at the bottom */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = i;
		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = i;
	}

	/* outer factors are mandatory for every prim mode */
	if (tessouter_idx == -1)
		return -1;

	/* inner factors are mandatory when the prim mode needs them */
	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* build (index, value) pairs: treg[i/2].{x,z} = TF buffer address,
	 * treg[i/2].{y,w} = factor value; outer factors first, then inner */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* one GDS TF_WRITE per factor component */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);	/* address */
		gds.src_sel_y = 1 + (2 * (i % 2));	/* value */
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;	/* 7 = no write-back to the GPR */
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
2737
2738static int r600_shader_from_tgsi(struct r600_context *rctx,
2739				 struct r600_pipe_shader *pipeshader,
2740				 union r600_shader_key key)
2741{
2742	struct r600_screen *rscreen = rctx->screen;
2743	struct r600_shader *shader = &pipeshader->shader;
2744	struct tgsi_token *tokens = pipeshader->selector->tokens;
2745	struct pipe_stream_output_info so = pipeshader->selector->so;
2746	struct tgsi_full_immediate *immediate;
2747	struct r600_shader_ctx ctx;
2748	struct r600_bytecode_output output[32];
2749	unsigned output_done, noutput;
2750	unsigned opcode;
2751	int i, j, k, r = 0;
2752	int next_param_base = 0, next_clip_base;
2753	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
2754	/* Declarations used by llvm code */
2755	bool use_llvm = false;
2756	bool indirect_gprs;
2757	bool ring_outputs = false;
2758	bool lds_outputs = false;
2759	bool lds_inputs = false;
2760	bool pos_emitted = false;
2761
2762#ifdef R600_USE_LLVM
2763	use_llvm = rscreen->b.debug_flags & DBG_LLVM;
2764#endif
2765	ctx.bc = &shader->bc;
2766	ctx.shader = shader;
2767	ctx.native_integers = true;
2768
2769
2770	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
2771			   rscreen->has_compressed_msaa_texturing);
2772	ctx.tokens = tokens;
2773	tgsi_scan_shader(tokens, &ctx.info);
2774	shader->indirect_files = ctx.info.indirect_files;
2775
2776	shader->uses_doubles = ctx.info.uses_doubles;
2777
2778	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
2779	tgsi_parse_init(&ctx.parse, tokens);
2780	ctx.type = ctx.info.processor;
2781	shader->processor_type = ctx.type;
2782	ctx.bc->type = shader->processor_type;
2783
2784	switch (ctx.type) {
2785	case TGSI_PROCESSOR_VERTEX:
2786		shader->vs_as_gs_a = key.vs.as_gs_a;
2787		shader->vs_as_es = key.vs.as_es;
2788		shader->vs_as_ls = key.vs.as_ls;
2789		if (shader->vs_as_es)
2790			ring_outputs = true;
2791		if (shader->vs_as_ls)
2792			lds_outputs = true;
2793		break;
2794	case TGSI_PROCESSOR_GEOMETRY:
2795		ring_outputs = true;
2796		break;
2797	case TGSI_PROCESSOR_TESS_CTRL:
2798		shader->tcs_prim_mode = key.tcs.prim_mode;
2799		lds_outputs = true;
2800		lds_inputs = true;
2801		break;
2802	case TGSI_PROCESSOR_TESS_EVAL:
2803		shader->tes_as_es = key.tes.as_es;
2804		lds_inputs = true;
2805		if (shader->tes_as_es)
2806			ring_outputs = true;
2807		break;
2808	case TGSI_PROCESSOR_FRAGMENT:
2809		shader->two_side = key.ps.color_two_side;
2810		break;
2811	default:
2812		break;
2813	}
2814
2815	if (shader->vs_as_es || shader->tes_as_es) {
2816		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
2817	} else {
2818		ctx.gs_for_vs = NULL;
2819	}
2820
2821	ctx.next_ring_offset = 0;
2822	ctx.gs_out_ring_offset = 0;
2823	ctx.gs_next_vertex = 0;
2824	ctx.gs_stream_output_info = &so;
2825
2826	ctx.face_gpr = -1;
2827	ctx.fixed_pt_position_gpr = -1;
2828	ctx.fragcoord_input = -1;
2829	ctx.colors_used = 0;
2830	ctx.clip_vertex_write = 0;
2831
2832	shader->nr_ps_color_exports = 0;
2833	shader->nr_ps_max_color_exports = 0;
2834
2835
2836	/* register allocations */
2837	/* Values [0,127] correspond to GPR[0..127].
2838	 * Values [128,159] correspond to constant buffer bank 0
2839	 * Values [160,191] correspond to constant buffer bank 1
2840	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
2841	 * Values [256,287] correspond to constant buffer bank 2 (EG)
2842	 * Values [288,319] correspond to constant buffer bank 3 (EG)
2843	 * Other special values are shown in the list below.
2844	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
2845	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
2846	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
2847	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
2848	 * 248	SQ_ALU_SRC_0: special constant 0.0.
2849	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
2850	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
2851	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
2852	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
2853	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
2854	 * 254	SQ_ALU_SRC_PV: previous vector result.
2855	 * 255	SQ_ALU_SRC_PS: previous scalar result.
2856	 */
2857	for (i = 0; i < TGSI_FILE_COUNT; i++) {
2858		ctx.file_offset[i] = 0;
2859	}
2860
2861#ifdef R600_USE_LLVM
2862	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
2863		fprintf(stderr, "Warning: R600 LLVM backend does not support "
2864				"indirect adressing.  Falling back to TGSI "
2865				"backend.\n");
2866		use_llvm = 0;
2867	}
2868#endif
2869	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
2870		ctx.file_offset[TGSI_FILE_INPUT] = 1;
2871		if (!use_llvm) {
2872			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
2873		}
2874	}
2875	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
2876		if (ctx.bc->chip_class >= EVERGREEN)
2877			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
2878		else
2879			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
2880	}
2881	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2882		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
2883		ctx.file_offset[TGSI_FILE_INPUT] = 2;
2884	}
2885	ctx.use_llvm = use_llvm;
2886
2887	if (use_llvm) {
2888		ctx.file_offset[TGSI_FILE_OUTPUT] =
2889			ctx.file_offset[TGSI_FILE_INPUT];
2890	} else {
2891	   ctx.file_offset[TGSI_FILE_OUTPUT] =
2892			ctx.file_offset[TGSI_FILE_INPUT] +
2893			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2894	}
2895	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
2896						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
2897
2898	/* Outside the GPR range. This will be translated to one of the
2899	 * kcache banks later. */
2900	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
2901
2902	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
2903	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
2904			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
2905	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
2906	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
2907
2908	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
2909		ctx.tess_input_info = ctx.bc->ar_reg + 3;
2910		ctx.tess_output_info = ctx.bc->ar_reg + 4;
2911		ctx.temp_reg = ctx.bc->ar_reg + 5;
2912	} else if (ctx.type == TGSI_PROCESSOR_TESS_EVAL) {
2913		ctx.tess_input_info = 0;
2914		ctx.tess_output_info = ctx.bc->ar_reg + 3;
2915		ctx.temp_reg = ctx.bc->ar_reg + 4;
2916	} else if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2917		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
2918		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
2919		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
2920		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
2921		ctx.temp_reg = ctx.bc->ar_reg + 7;
2922	} else {
2923		ctx.temp_reg = ctx.bc->ar_reg + 3;
2924	}
2925
2926	shader->max_arrays = 0;
2927	shader->num_arrays = 0;
2928	if (indirect_gprs) {
2929
2930		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
2931			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
2932			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
2933			                   ctx.file_offset[TGSI_FILE_INPUT],
2934			                   0x0F);
2935		}
2936		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
2937			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
2938			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
2939			                   ctx.file_offset[TGSI_FILE_OUTPUT],
2940			                   0x0F);
2941		}
2942	}
2943
2944	ctx.nliterals = 0;
2945	ctx.literals = NULL;
2946
2947	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
2948	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
2949	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
2950
2951	if (shader->vs_as_gs_a)
2952		vs_add_primid_output(&ctx, key.vs.prim_id_out);
2953
2954	if (ctx.type == TGSI_PROCESSOR_TESS_EVAL)
2955		r600_fetch_tess_io_info(&ctx);
2956
2957	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2958		tgsi_parse_token(&ctx.parse);
2959		switch (ctx.parse.FullToken.Token.Type) {
2960		case TGSI_TOKEN_TYPE_IMMEDIATE:
2961			immediate = &ctx.parse.FullToken.FullImmediate;
2962			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
2963			if(ctx.literals == NULL) {
2964				r = -ENOMEM;
2965				goto out_err;
2966			}
2967			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
2968			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
2969			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
2970			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
2971			ctx.nliterals++;
2972			break;
2973		case TGSI_TOKEN_TYPE_DECLARATION:
2974			r = tgsi_declaration(&ctx);
2975			if (r)
2976				goto out_err;
2977			break;
2978		case TGSI_TOKEN_TYPE_INSTRUCTION:
2979		case TGSI_TOKEN_TYPE_PROPERTY:
2980			break;
2981		default:
2982			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
2983			r = -EINVAL;
2984			goto out_err;
2985		}
2986	}
2987
2988	shader->ring_item_sizes[0] = ctx.next_ring_offset;
2989	shader->ring_item_sizes[1] = 0;
2990	shader->ring_item_sizes[2] = 0;
2991	shader->ring_item_sizes[3] = 0;
2992
2993	/* Process two side if needed */
2994	if (shader->two_side && ctx.colors_used) {
2995		int i, count = ctx.shader->ninput;
2996		unsigned next_lds_loc = ctx.shader->nlds;
2997
2998		/* additional inputs will be allocated right after the existing inputs,
2999		 * we won't need them after the color selection, so we don't need to
3000		 * reserve these gprs for the rest of the shader code and to adjust
3001		 * output offsets etc. */
3002		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3003				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3004
3005		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3006		if (ctx.face_gpr == -1) {
3007			i = ctx.shader->ninput++;
3008			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3009			ctx.shader->input[i].spi_sid = 0;
3010			ctx.shader->input[i].gpr = gpr++;
3011			ctx.face_gpr = ctx.shader->input[i].gpr;
3012		}
3013
3014		for (i = 0; i < count; i++) {
3015			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3016				int ni = ctx.shader->ninput++;
3017				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3018				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3019				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3020				ctx.shader->input[ni].gpr = gpr++;
3021				// TGSI to LLVM needs to know the lds position of inputs.
3022				// Non LLVM path computes it later (in process_twoside_color)
3023				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3024				ctx.shader->input[i].back_color_input = ni;
3025				if (ctx.bc->chip_class >= EVERGREEN) {
3026					if ((r = evergreen_interp_input(&ctx, ni)))
3027						return r;
3028				}
3029			}
3030		}
3031	}
3032
3033/* LLVM backend setup */
3034#ifdef R600_USE_LLVM
3035	if (use_llvm) {
3036		struct radeon_llvm_context radeon_llvm_ctx;
3037		LLVMModuleRef mod;
3038		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
3039		boolean use_kill = false;
3040
3041		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
3042		radeon_llvm_ctx.type = ctx.type;
3043		radeon_llvm_ctx.two_side = shader->two_side;
3044		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
3045		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
3046		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
3047		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
3048		radeon_llvm_ctx.color_buffer_count = max_color_exports;
3049		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
3050		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
3051		radeon_llvm_ctx.stream_outputs = &so;
3052		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
3053		radeon_llvm_ctx.has_compressed_msaa_texturing =
3054			ctx.bc->has_compressed_msaa_texturing;
3055		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
3056		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
3057		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
3058
3059		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
3060			radeon_llvm_dispose(&radeon_llvm_ctx);
3061			use_llvm = 0;
3062			fprintf(stderr, "R600 LLVM backend failed to compile "
3063				"shader.  Falling back to TGSI\n");
3064		} else {
3065			ctx.file_offset[TGSI_FILE_OUTPUT] =
3066					ctx.file_offset[TGSI_FILE_INPUT];
3067		}
3068		if (use_kill)
3069			ctx.shader->uses_kill = use_kill;
3070		radeon_llvm_dispose(&radeon_llvm_ctx);
3071	}
3072#endif
3073/* End of LLVM backend setup */
3074
3075	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3076		shader->nr_ps_max_color_exports = 8;
3077
3078	if (!use_llvm) {
3079		if (ctx.fragcoord_input >= 0) {
3080			if (ctx.bc->chip_class == CAYMAN) {
3081				for (j = 0 ; j < 4; j++) {
3082					struct r600_bytecode_alu alu;
3083					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3084					alu.op = ALU_OP1_RECIP_IEEE;
3085					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3086					alu.src[0].chan = 3;
3087
3088					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3089					alu.dst.chan = j;
3090					alu.dst.write = (j == 3);
3091					alu.last = 1;
3092					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3093						return r;
3094				}
3095			} else {
3096				struct r600_bytecode_alu alu;
3097				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3098				alu.op = ALU_OP1_RECIP_IEEE;
3099				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3100				alu.src[0].chan = 3;
3101
3102				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3103				alu.dst.chan = 3;
3104				alu.dst.write = 1;
3105				alu.last = 1;
3106				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3107					return r;
3108			}
3109		}
3110
3111		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3112			struct r600_bytecode_alu alu;
3113			int r;
3114
3115			/* GS thread with no output workaround - emit a cut at start of GS */
3116			if (ctx.bc->chip_class == R600)
3117				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3118
3119			for (j = 0; j < 4; j++) {
3120				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3121				alu.op = ALU_OP1_MOV;
3122				alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3123				alu.src[0].value = 0;
3124				alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3125				alu.dst.write = 1;
3126				alu.last = 1;
3127				r = r600_bytecode_add_alu(ctx.bc, &alu);
3128				if (r)
3129					return r;
3130			}
3131		}
3132
3133		if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
3134			r600_fetch_tess_io_info(&ctx);
3135
3136		if (shader->two_side && ctx.colors_used) {
3137			if ((r = process_twoside_color_inputs(&ctx)))
3138				return r;
3139		}
3140
3141		tgsi_parse_init(&ctx.parse, tokens);
3142		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3143			tgsi_parse_token(&ctx.parse);
3144			switch (ctx.parse.FullToken.Token.Type) {
3145			case TGSI_TOKEN_TYPE_INSTRUCTION:
3146				r = tgsi_is_supported(&ctx);
3147				if (r)
3148					goto out_err;
3149				ctx.max_driver_temp_used = 0;
3150				/* reserve first tmp for everyone */
3151				r600_get_temp(&ctx);
3152
3153				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3154				if ((r = tgsi_split_constant(&ctx)))
3155					goto out_err;
3156				if ((r = tgsi_split_literal_constant(&ctx)))
3157					goto out_err;
3158				if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3159					if ((r = tgsi_split_gs_inputs(&ctx)))
3160						goto out_err;
3161				} else if (lds_inputs) {
3162					if ((r = tgsi_split_lds_inputs(&ctx)))
3163						goto out_err;
3164				}
3165				if (ctx.bc->chip_class == CAYMAN)
3166					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3167				else if (ctx.bc->chip_class >= EVERGREEN)
3168					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3169				else
3170					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3171				r = ctx.inst_info->process(&ctx);
3172				if (r)
3173					goto out_err;
3174
3175				if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
3176					r = r600_store_tcs_output(&ctx);
3177					if (r)
3178						goto out_err;
3179				}
3180				break;
3181			default:
3182				break;
3183			}
3184		}
3185	}
3186
3187	/* Reset the temporary register counter. */
3188	ctx.max_driver_temp_used = 0;
3189
3190	noutput = shader->noutput;
3191
3192	if (!ring_outputs && ctx.clip_vertex_write) {
3193		unsigned clipdist_temp[2];
3194
3195		clipdist_temp[0] = r600_get_temp(&ctx);
3196		clipdist_temp[1] = r600_get_temp(&ctx);
3197
3198		/* need to convert a clipvertex write into clipdistance writes and not export
3199		   the clip vertex anymore */
3200
3201		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3202		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3203		shader->output[noutput].gpr = clipdist_temp[0];
3204		noutput++;
3205		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3206		shader->output[noutput].gpr = clipdist_temp[1];
3207		noutput++;
3208
3209		/* reset spi_sid for clipvertex output to avoid confusing spi */
3210		shader->output[ctx.cv_output].spi_sid = 0;
3211
3212		shader->clip_dist_write = 0xFF;
3213
3214		for (i = 0; i < 8; i++) {
3215			int oreg = i >> 2;
3216			int ochan = i & 3;
3217
3218			for (j = 0; j < 4; j++) {
3219				struct r600_bytecode_alu alu;
3220				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3221				alu.op = ALU_OP2_DOT4;
3222				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3223				alu.src[0].chan = j;
3224
3225				alu.src[1].sel = 512 + i;
3226				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3227				alu.src[1].chan = j;
3228
3229				alu.dst.sel = clipdist_temp[oreg];
3230				alu.dst.chan = j;
3231				alu.dst.write = (j == ochan);
3232				if (j == 3)
3233					alu.last = 1;
3234				if (!use_llvm)
3235					r = r600_bytecode_add_alu(ctx.bc, &alu);
3236				if (r)
3237					return r;
3238			}
3239		}
3240	}
3241
3242	/* Add stream outputs. */
3243	if (!use_llvm && so.num_outputs) {
3244		bool emit = false;
3245		if (!lds_outputs && !ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX)
3246			emit = true;
3247		if (!ring_outputs && ctx.type == TGSI_PROCESSOR_TESS_EVAL)
3248			emit = true;
3249		if (emit)
3250			emit_streamout(&ctx, &so, -1, NULL);
3251	}
3252	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3253	convert_edgeflag_to_int(&ctx);
3254
3255	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
3256		r600_emit_tess_factor(&ctx);
3257
3258	if (lds_outputs) {
3259		if (ctx.type == TGSI_PROCESSOR_VERTEX) {
3260			if (ctx.shader->noutput)
3261				emit_lds_vs_writes(&ctx);
3262		}
3263	} else if (ring_outputs) {
3264		if (shader->vs_as_es || shader->tes_as_es) {
3265			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3266			ctx.gs_export_gpr_tregs[1] = -1;
3267			ctx.gs_export_gpr_tregs[2] = -1;
3268			ctx.gs_export_gpr_tregs[3] = -1;
3269
3270			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3271		}
3272	} else {
3273		/* Export output */
3274		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3275
3276		for (i = 0, j = 0; i < noutput; i++, j++) {
3277			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3278			output[j].gpr = shader->output[i].gpr;
3279			output[j].elem_size = 3;
3280			output[j].swizzle_x = 0;
3281			output[j].swizzle_y = 1;
3282			output[j].swizzle_z = 2;
3283			output[j].swizzle_w = 3;
3284			output[j].burst_count = 1;
3285			output[j].type = -1;
3286			output[j].op = CF_OP_EXPORT;
3287			switch (ctx.type) {
3288			case TGSI_PROCESSOR_VERTEX:
3289			case TGSI_PROCESSOR_TESS_EVAL:
3290				switch (shader->output[i].name) {
3291				case TGSI_SEMANTIC_POSITION:
3292					output[j].array_base = 60;
3293					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3294					pos_emitted = true;
3295					break;
3296
3297				case TGSI_SEMANTIC_PSIZE:
3298					output[j].array_base = 61;
3299					output[j].swizzle_y = 7;
3300					output[j].swizzle_z = 7;
3301					output[j].swizzle_w = 7;
3302					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3303					pos_emitted = true;
3304					break;
3305				case TGSI_SEMANTIC_EDGEFLAG:
3306					output[j].array_base = 61;
3307					output[j].swizzle_x = 7;
3308					output[j].swizzle_y = 0;
3309					output[j].swizzle_z = 7;
3310					output[j].swizzle_w = 7;
3311					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3312					pos_emitted = true;
3313					break;
3314				case TGSI_SEMANTIC_LAYER:
3315					/* spi_sid is 0 for outputs that are
3316					 * not consumed by PS */
3317					if (shader->output[i].spi_sid) {
3318						output[j].array_base = next_param_base++;
3319						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3320						j++;
3321						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3322					}
3323					output[j].array_base = 61;
3324					output[j].swizzle_x = 7;
3325					output[j].swizzle_y = 7;
3326					output[j].swizzle_z = 0;
3327					output[j].swizzle_w = 7;
3328					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3329					pos_emitted = true;
3330					break;
3331				case TGSI_SEMANTIC_VIEWPORT_INDEX:
3332					/* spi_sid is 0 for outputs that are
3333					 * not consumed by PS */
3334					if (shader->output[i].spi_sid) {
3335						output[j].array_base = next_param_base++;
3336						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3337						j++;
3338						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3339					}
3340					output[j].array_base = 61;
3341					output[j].swizzle_x = 7;
3342					output[j].swizzle_y = 7;
3343					output[j].swizzle_z = 7;
3344					output[j].swizzle_w = 0;
3345					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3346					pos_emitted = true;
3347					break;
3348				case TGSI_SEMANTIC_CLIPVERTEX:
3349					j--;
3350					break;
3351				case TGSI_SEMANTIC_CLIPDIST:
3352					output[j].array_base = next_clip_base++;
3353					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3354					pos_emitted = true;
3355					/* spi_sid is 0 for clipdistance outputs that were generated
3356					 * for clipvertex - we don't need to pass them to PS */
3357					if (shader->output[i].spi_sid) {
3358						j++;
3359						/* duplicate it as PARAM to pass to the pixel shader */
3360						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3361						output[j].array_base = next_param_base++;
3362						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3363					}
3364					break;
3365				case TGSI_SEMANTIC_FOG:
3366					output[j].swizzle_y = 4; /* 0 */
3367					output[j].swizzle_z = 4; /* 0 */
3368					output[j].swizzle_w = 5; /* 1 */
3369					break;
3370				case TGSI_SEMANTIC_PRIMID:
3371					output[j].swizzle_x = 2;
3372					output[j].swizzle_y = 4; /* 0 */
3373					output[j].swizzle_z = 4; /* 0 */
3374					output[j].swizzle_w = 4; /* 0 */
3375					break;
3376				}
3377
3378				break;
3379			case TGSI_PROCESSOR_FRAGMENT:
3380				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3381					/* never export more colors than the number of CBs */
3382					if (shader->output[i].sid >= max_color_exports) {
3383						/* skip export */
3384						j--;
3385						continue;
3386					}
3387					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3388					output[j].array_base = shader->output[i].sid;
3389					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3390					shader->nr_ps_color_exports++;
3391					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3392						for (k = 1; k < max_color_exports; k++) {
3393							j++;
3394							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3395							output[j].gpr = shader->output[i].gpr;
3396							output[j].elem_size = 3;
3397							output[j].swizzle_x = 0;
3398							output[j].swizzle_y = 1;
3399							output[j].swizzle_z = 2;
3400							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3401							output[j].burst_count = 1;
3402							output[j].array_base = k;
3403							output[j].op = CF_OP_EXPORT;
3404							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3405							shader->nr_ps_color_exports++;
3406						}
3407					}
3408				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3409					output[j].array_base = 61;
3410					output[j].swizzle_x = 2;
3411					output[j].swizzle_y = 7;
3412					output[j].swizzle_z = output[j].swizzle_w = 7;
3413					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3414				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3415					output[j].array_base = 61;
3416					output[j].swizzle_x = 7;
3417					output[j].swizzle_y = 1;
3418					output[j].swizzle_z = output[j].swizzle_w = 7;
3419					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3420				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3421					output[j].array_base = 61;
3422					output[j].swizzle_x = 7;
3423					output[j].swizzle_y = 7;
3424					output[j].swizzle_z = 0;
3425					output[j].swizzle_w = 7;
3426					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3427				} else {
3428					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3429					r = -EINVAL;
3430					goto out_err;
3431				}
3432				break;
3433			case TGSI_PROCESSOR_TESS_CTRL:
3434				break;
3435			default:
3436				R600_ERR("unsupported processor type %d\n", ctx.type);
3437				r = -EINVAL;
3438				goto out_err;
3439			}
3440
3441			if (output[j].type==-1) {
3442				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3443				output[j].array_base = next_param_base++;
3444			}
3445		}
3446
3447		/* add fake position export */
3448		if ((ctx.type == TGSI_PROCESSOR_VERTEX || ctx.type == TGSI_PROCESSOR_TESS_EVAL) && pos_emitted == false) {
3449			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3450			output[j].gpr = 0;
3451			output[j].elem_size = 3;
3452			output[j].swizzle_x = 7;
3453			output[j].swizzle_y = 7;
3454			output[j].swizzle_z = 7;
3455			output[j].swizzle_w = 7;
3456			output[j].burst_count = 1;
3457			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3458			output[j].array_base = 60;
3459			output[j].op = CF_OP_EXPORT;
3460			j++;
3461		}
3462
3463		/* add fake param output for vertex shader if no param is exported */
3464		if ((ctx.type == TGSI_PROCESSOR_VERTEX || ctx.type == TGSI_PROCESSOR_TESS_EVAL) && next_param_base == 0) {
3465			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3466			output[j].gpr = 0;
3467			output[j].elem_size = 3;
3468			output[j].swizzle_x = 7;
3469			output[j].swizzle_y = 7;
3470			output[j].swizzle_z = 7;
3471			output[j].swizzle_w = 7;
3472			output[j].burst_count = 1;
3473			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3474			output[j].array_base = 0;
3475			output[j].op = CF_OP_EXPORT;
3476			j++;
3477		}
3478
3479		/* add fake pixel export */
3480		if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
3481			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3482			output[j].gpr = 0;
3483			output[j].elem_size = 3;
3484			output[j].swizzle_x = 7;
3485			output[j].swizzle_y = 7;
3486			output[j].swizzle_z = 7;
3487			output[j].swizzle_w = 7;
3488			output[j].burst_count = 1;
3489			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3490			output[j].array_base = 0;
3491			output[j].op = CF_OP_EXPORT;
3492			j++;
3493			shader->nr_ps_color_exports++;
3494		}
3495
3496		noutput = j;
3497
3498		/* set export done on last export of each type */
3499		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
3500			if (!(output_done & (1 << output[i].type))) {
3501				output_done |= (1 << output[i].type);
3502				output[i].op = CF_OP_EXPORT_DONE;
3503			}
3504		}
3505		/* add output to bytecode */
3506		if (!use_llvm) {
3507			for (i = 0; i < noutput; i++) {
3508				r = r600_bytecode_add_output(ctx.bc, &output[i]);
3509				if (r)
3510					goto out_err;
3511			}
3512		}
3513	}
3514
3515	/* add program end */
3516	if (!use_llvm) {
3517		if (ctx.bc->chip_class == CAYMAN)
3518			cm_bytecode_add_cf_end(ctx.bc);
3519		else {
3520			const struct cf_op_info *last = NULL;
3521
3522			if (ctx.bc->cf_last)
3523				last = r600_isa_cf(ctx.bc->cf_last->op);
3524
3525			/* alu clause instructions don't have EOP bit, so add NOP */
3526			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
3527				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
3528
3529			ctx.bc->cf_last->end_of_program = 1;
3530		}
3531	}
3532
3533	/* check GPR limit - we have 124 = 128 - 4
3534	 * (4 are reserved as alu clause temporary registers) */
3535	if (ctx.bc->ngpr > 124) {
3536		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
3537		r = -ENOMEM;
3538		goto out_err;
3539	}
3540
3541	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3542		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
3543			return r;
3544	}
3545
3546	free(ctx.literals);
3547	tgsi_parse_free(&ctx.parse);
3548	return 0;
3549out_err:
3550	free(ctx.literals);
3551	tgsi_parse_free(&ctx.parse);
3552	return r;
3553}
3554
3555static int tgsi_unsupported(struct r600_shader_ctx *ctx)
3556{
3557	const unsigned tgsi_opcode =
3558		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
3559	R600_ERR("%s tgsi opcode unsupported\n",
3560		 tgsi_get_opcode_name(tgsi_opcode));
3561	return -EINVAL;
3562}
3563
/* Handler for the TGSI END instruction: intentionally a no-op.  The
 * end-of-program marker is emitted separately when the bytecode is
 * finalized. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
3568
3569static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
3570			const struct r600_shader_src *shader_src,
3571			unsigned chan)
3572{
3573	bc_src->sel = shader_src->sel;
3574	bc_src->chan = shader_src->swizzle[chan];
3575	bc_src->neg = shader_src->neg;
3576	bc_src->abs = shader_src->abs;
3577	bc_src->rel = shader_src->rel;
3578	bc_src->value = shader_src->value[bc_src->chan];
3579	bc_src->kc_bank = shader_src->kc_bank;
3580	bc_src->kc_rel = shader_src->kc_rel;
3581}
3582
3583static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
3584{
3585	bc_src->abs = 1;
3586	bc_src->neg = 0;
3587}
3588
3589static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
3590{
3591	bc_src->neg = !bc_src->neg;
3592}
3593
3594static void tgsi_dst(struct r600_shader_ctx *ctx,
3595		     const struct tgsi_full_dst_register *tgsi_dst,
3596		     unsigned swizzle,
3597		     struct r600_bytecode_alu_dst *r600_dst)
3598{
3599	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3600
3601	r600_dst->sel = tgsi_dst->Register.Index;
3602	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
3603	r600_dst->chan = swizzle;
3604	r600_dst->write = 1;
3605	if (inst->Instruction.Saturate) {
3606		r600_dst->clamp = 1;
3607	}
3608	if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) {
3609		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
3610			return;
3611		}
3612	}
3613	if (tgsi_dst->Register.Indirect)
3614		r600_dst->rel = V_SQ_REL_RELATIVE;
3615
3616}
3617
/* Emit a two-source 64-bit (double precision) ALU operation.
 *
 * Doubles occupy channel pairs (xy and/or zw).  When singledest is set,
 * the TGSI writemask names a single component; it is widened here to the
 * full pair, and if the named component is the high half (y or w) the
 * result is computed in temp_reg and copied to the real destination
 * afterwards (use_tmp records which temp channel to copy from:
 * 1 -> chan 0, 3 -> chan 2).  When swap is set the two source operands
 * are exchanged. */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* Widen a single-component mask to its channel pair; a
		 * high-half destination (y/w) needs the temp-copy path. */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	/* recompute: the mask may have been widened above */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* single-dest ops only write the low channel of each pair */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			/* DABS reads its source without the fp64 channel swap;
			 * the abs modifier is applied below on the high half */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases: source modifiers are applied
		 * only on the high half of each pair (channels 1 and 3) */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* restore the caller's original (unwidened) writemask */
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3720
3721static int tgsi_op2_64(struct r600_shader_ctx *ctx)
3722{
3723	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3724	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3725	/* confirm writemasking */
3726	if ((write_mask & 0x3) != 0x3 &&
3727	    (write_mask & 0xc) != 0xc) {
3728		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
3729		return -1;
3730	}
3731	return tgsi_op2_64_params(ctx, false, false);
3732}
3733
/* 64-bit op producing a single double result per channel pair
 * (writemask widening and temp staging handled by tgsi_op2_64_params). */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
3738
/* Same as tgsi_op2_64_single_dest but with the two source operands
 * exchanged before emission. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
3743
3744static int tgsi_op3_64(struct r600_shader_ctx *ctx)
3745{
3746	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3747	struct r600_bytecode_alu alu;
3748	int i, j, r;
3749	int lasti = 3;
3750	int tmp = r600_get_temp(ctx);
3751
3752	for (i = 0; i < lasti + 1; i++) {
3753
3754		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3755		alu.op = ctx->inst_info->op;
3756		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3757			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
3758		}
3759
3760		if (inst->Dst[0].Register.WriteMask & (1 << i))
3761			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3762		else
3763			alu.dst.sel = tmp;
3764
3765		alu.dst.chan = i;
3766		alu.is_op3 = 1;
3767		if (i == lasti) {
3768			alu.last = 1;
3769		}
3770		r = r600_bytecode_add_alu(ctx->bc, &alu);
3771		if (r)
3772			return r;
3773	}
3774	return 0;
3775}
3776
/* Generic two-source ALU emitter used by most TGSI op2 handlers.
 *
 * swap       - exchange src0/src1 before emission (for reversed hardware
 *              operand order).
 * trans_only - force alu.last on every emitted instruction so each
 *              channel closes its own ALU group (presumably because the
 *              op is restricted to the trans unit); with more than one
 *              destination channel the results are staged in temp_reg
 *              and copied out afterwards. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is emitted as the table's op with src1 negated */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			/* ABS uses the source abs modifier instead of an opcode */
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3846
/* Plain two-source op: no operand swap, not trans-only. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
3851
/* Two-source op with src0/src1 exchanged before emission. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
3856
/* Two-source op emitted with trans_only set: every channel closes its
 * own ALU group (see tgsi_op2_s). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
3861
3862static int tgsi_ineg(struct r600_shader_ctx *ctx)
3863{
3864	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3865	struct r600_bytecode_alu alu;
3866	int i, r;
3867	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3868
3869	for (i = 0; i < lasti + 1; i++) {
3870
3871		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3872			continue;
3873		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3874		alu.op = ctx->inst_info->op;
3875
3876		alu.src[0].sel = V_SQ_ALU_SRC_0;
3877
3878		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3879
3880		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3881
3882		if (i == lasti) {
3883			alu.last = 1;
3884		}
3885		r = r600_bytecode_add_alu(ctx->bc, &alu);
3886		if (r)
3887			return r;
3888	}
3889	return 0;
3890
3891}
3892
3893static int tgsi_dneg(struct r600_shader_ctx *ctx)
3894{
3895	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3896	struct r600_bytecode_alu alu;
3897	int i, r;
3898	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3899
3900	for (i = 0; i < lasti + 1; i++) {
3901
3902		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3903			continue;
3904		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3905		alu.op = ALU_OP1_MOV;
3906
3907		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3908
3909		if (i == 1 || i == 3)
3910			r600_bytecode_src_toggle_neg(&alu.src[0]);
3911		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3912
3913		if (i == lasti) {
3914			alu.last = 1;
3915		}
3916		r = r600_bytecode_add_alu(ctx->bc, &alu);
3917		if (r)
3918			return r;
3919	}
3920	return 0;
3921
3922}
3923
/* DFRACEXP: split a double into fraction and exponent (two TGSI dests).
 * The hardware op is run on all four channels into temp_reg; the
 * fraction pair is then moved from temp channels 2/3 to dst0's selected
 * channel pair, and one exponent word is moved to dst1's first enabled
 * channel. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* dst0's writemask selects which pair (xy vs zw) gets the fraction */
	int firsti = write_mask == 0xc ? 2 : 0;

	/* run the op on all four channels, staging results in temp_reg */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* fraction double lives in temp channels 2 and 3 */
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			/* NOTE(review): exponent is read from temp chan 1 here
			 * while the fraction comes from chans 2/3 above —
			 * confirm against the FREXP_64 result layout. */
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* only the first enabled dst1 channel receives the exponent */
			break;
		}
	}
	return 0;
}
3984
3985
/* I2D/U2D on Evergreen/Cayman: widen 32-bit int/uint to 64-bit double.
 *
 * Pass 1 converts each packed integer to a 32-bit float in temp_reg
 * (one conversion per resulting double).  Pass 2 widens those floats
 * with FLT32_TO_FLT64 across each destination channel pair: even
 * channels read the float from temp_reg, odd channels are fed a zero
 * literal.  NOTE(review): assumes FLT32_TO_FLT64 produces the two
 * halves of the double this way -- confirm against the ISA docs.
 */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* pass 1: INT_TO_FLT/UINT_TO_FLT; one source channel per double,
	 * hence the halved loop bound */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: float -> double, expanding over channel pairs */
	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			/* odd (second) channel of the pair: zero literal */
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4032
4033static int egcm_double_to_int(struct r600_shader_ctx *ctx)
4034{
4035	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4036	struct r600_bytecode_alu alu;
4037	int i, r;
4038	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4039
4040	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
4041		inst->Instruction.Opcode == TGSI_OPCODE_D2U);
4042
4043	for (i = 0; i <= lasti; i++) {
4044		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4045		alu.op = ALU_OP1_FLT64_TO_FLT32;
4046
4047		r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
4048		alu.dst.chan = i;
4049		alu.dst.sel = ctx->temp_reg;
4050		alu.dst.write = i%2 == 0;
4051		alu.last = i == lasti;
4052
4053		r = r600_bytecode_add_alu(ctx->bc, &alu);
4054		if (r)
4055			return r;
4056	}
4057
4058	for (i = 0; i <= (lasti+1)/2; i++) {
4059		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4060		alu.op = ctx->inst_info->op;
4061
4062		alu.src[0].chan = i*2;
4063		alu.src[0].sel = ctx->temp_reg;
4064		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4065		alu.last = 1;
4066
4067		r = r600_bytecode_add_alu(ctx->bc, &alu);
4068		if (r)
4069			return r;
4070	}
4071
4072	return 0;
4073}
4074
/* Emit a Cayman double-precision transcendental (e.g. DRSQ/DSQRT/DRCP).
 *
 * The single source double is issued over three slots with its two
 * channels passed as separate operands (src[0] = channel 1,
 * src[1] = channel 0); the result pair is kept in t1.xy and then
 * broadcast, so destination pairs xy and zw both receive t1.xy.
 */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		/* pass the two halves of the double as separate operands */
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		/* only slots 0 and 1 produce the result pair */
		alu.dst.write = (i == 0 || i == 1);

		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* broadcast t1.xy to every channel in the write mask: channels
	 * x/z read t1.x, channels y/w read t1.y */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4128
4129static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
4130{
4131	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4132	int i, j, r;
4133	struct r600_bytecode_alu alu;
4134	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4135
4136	for (i = 0 ; i < last_slot; i++) {
4137		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4138		alu.op = ctx->inst_info->op;
4139		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4140			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
4141
4142			/* RSQ should take the absolute value of src */
4143			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
4144				r600_bytecode_src_set_abs(&alu.src[j]);
4145			}
4146		}
4147		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4148		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4149
4150		if (i == last_slot - 1)
4151			alu.last = 1;
4152		r = r600_bytecode_add_alu(ctx->bc, &alu);
4153		if (r)
4154			return r;
4155	}
4156	return 0;
4157}
4158
/* Emit a Cayman integer multiply (MULLO/MULHI-style ops).
 *
 * For each written destination channel k, the op is issued on all
 * four vector slots with only slot k's write enabled into t1.k
 * (presumably slot n yields channel n's result on this hardware --
 * confirm against the ISA docs).  The collected results are then
 * moved from t1 to the real destination.
 */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		/* full four-slot issue, only slot k's result is kept */
		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* move the collected results from t1 to the destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4206
4207
4208static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
4209{
4210	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4211	int i, j, k, r;
4212	struct r600_bytecode_alu alu;
4213	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4214	int t1 = ctx->temp_reg;
4215
4216	for (k = 0; k < 2; k++) {
4217		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
4218			continue;
4219
4220		for (i = 0; i < 4; i++) {
4221			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4222			alu.op = ctx->inst_info->op;
4223			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4224				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
4225			}
4226			alu.dst.sel = t1;
4227			alu.dst.chan = i;
4228			alu.dst.write = 1;
4229			if (i == 3)
4230				alu.last = 1;
4231			r = r600_bytecode_add_alu(ctx->bc, &alu);
4232			if (r)
4233				return r;
4234		}
4235	}
4236
4237	for (i = 0; i <= lasti; i++) {
4238		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4239			continue;
4240		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4241		alu.op = ALU_OP1_MOV;
4242		alu.src[0].sel = t1;
4243		alu.src[0].chan = i;
4244		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4245		alu.dst.write = 1;
4246		if (i == lasti)
4247			alu.last = 1;
4248		r = r600_bytecode_add_alu(ctx->bc, &alu);
4249		if (r)
4250			return r;
4251	}
4252
4253	return 0;
4254}
4255
4256/*
4257 * r600 - trunc to -PI..PI range
4258 * r700 - normalize by dividing by 2PI
4259 * see fdo bug 27901
4260 */
4261static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
4262{
4263	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
4264	static float double_pi = 3.1415926535 * 2;
4265	static float neg_pi = -3.1415926535;
4266
4267	int r;
4268	struct r600_bytecode_alu alu;
4269
4270	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4271	alu.op = ALU_OP3_MULADD;
4272	alu.is_op3 = 1;
4273
4274	alu.dst.chan = 0;
4275	alu.dst.sel = ctx->temp_reg;
4276	alu.dst.write = 1;
4277
4278	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4279
4280	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4281	alu.src[1].chan = 0;
4282	alu.src[1].value = *(uint32_t *)&half_inv_pi;
4283	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4284	alu.src[2].chan = 0;
4285	alu.last = 1;
4286	r = r600_bytecode_add_alu(ctx->bc, &alu);
4287	if (r)
4288		return r;
4289
4290	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4291	alu.op = ALU_OP1_FRACT;
4292
4293	alu.dst.chan = 0;
4294	alu.dst.sel = ctx->temp_reg;
4295	alu.dst.write = 1;
4296
4297	alu.src[0].sel = ctx->temp_reg;
4298	alu.src[0].chan = 0;
4299	alu.last = 1;
4300	r = r600_bytecode_add_alu(ctx->bc, &alu);
4301	if (r)
4302		return r;
4303
4304	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4305	alu.op = ALU_OP3_MULADD;
4306	alu.is_op3 = 1;
4307
4308	alu.dst.chan = 0;
4309	alu.dst.sel = ctx->temp_reg;
4310	alu.dst.write = 1;
4311
4312	alu.src[0].sel = ctx->temp_reg;
4313	alu.src[0].chan = 0;
4314
4315	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4316	alu.src[1].chan = 0;
4317	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4318	alu.src[2].chan = 0;
4319
4320	if (ctx->bc->chip_class == R600) {
4321		alu.src[1].value = *(uint32_t *)&double_pi;
4322		alu.src[2].value = *(uint32_t *)&neg_pi;
4323	} else {
4324		alu.src[1].sel = V_SQ_ALU_SRC_1;
4325		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4326		alu.src[2].neg = 1;
4327	}
4328
4329	alu.last = 1;
4330	r = r600_bytecode_add_alu(ctx->bc, &alu);
4331	if (r)
4332		return r;
4333	return 0;
4334}
4335
4336static int cayman_trig(struct r600_shader_ctx *ctx)
4337{
4338	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4339	struct r600_bytecode_alu alu;
4340	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4341	int i, r;
4342
4343	r = tgsi_setup_trig(ctx);
4344	if (r)
4345		return r;
4346
4347
4348	for (i = 0; i < last_slot; i++) {
4349		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4350		alu.op = ctx->inst_info->op;
4351		alu.dst.chan = i;
4352
4353		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4354		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4355
4356		alu.src[0].sel = ctx->temp_reg;
4357		alu.src[0].chan = 0;
4358		if (i == last_slot - 1)
4359			alu.last = 1;
4360		r = r600_bytecode_add_alu(ctx->bc, &alu);
4361		if (r)
4362			return r;
4363	}
4364	return 0;
4365}
4366
4367static int tgsi_trig(struct r600_shader_ctx *ctx)
4368{
4369	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4370	struct r600_bytecode_alu alu;
4371	int i, r;
4372	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4373
4374	r = tgsi_setup_trig(ctx);
4375	if (r)
4376		return r;
4377
4378	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4379	alu.op = ctx->inst_info->op;
4380	alu.dst.chan = 0;
4381	alu.dst.sel = ctx->temp_reg;
4382	alu.dst.write = 1;
4383
4384	alu.src[0].sel = ctx->temp_reg;
4385	alu.src[0].chan = 0;
4386	alu.last = 1;
4387	r = r600_bytecode_add_alu(ctx->bc, &alu);
4388	if (r)
4389		return r;
4390
4391	/* replicate result */
4392	for (i = 0; i < lasti + 1; i++) {
4393		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4394			continue;
4395
4396		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4397		alu.op = ALU_OP1_MOV;
4398
4399		alu.src[0].sel = ctx->temp_reg;
4400		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4401		if (i == lasti)
4402			alu.last = 1;
4403		r = r600_bytecode_add_alu(ctx->bc, &alu);
4404		if (r)
4405			return r;
4406	}
4407	return 0;
4408}
4409
/* SCS: dst.x = cos(src.x), dst.y = sin(src.x), dst.z = 0.0, dst.w = 1.0.
 *
 * The shared angle reduction (tgsi_setup_trig) leaves the prepared
 * angle in temp_reg.x, which COS/SIN then read.  On Cayman the scalar
 * trig ops are replicated over three vector slots with only the
 * target channel's write enabled.
 */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			/* replicate over three slots, keep only slot 0's result */
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			/* replicate over three slots, keep only slot 1's result */
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4530
4531static int tgsi_kill(struct r600_shader_ctx *ctx)
4532{
4533	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4534	struct r600_bytecode_alu alu;
4535	int i, r;
4536
4537	for (i = 0; i < 4; i++) {
4538		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4539		alu.op = ctx->inst_info->op;
4540
4541		alu.dst.chan = i;
4542
4543		alu.src[0].sel = V_SQ_ALU_SRC_0;
4544
4545		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
4546			alu.src[1].sel = V_SQ_ALU_SRC_1;
4547			alu.src[1].neg = 1;
4548		} else {
4549			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4550		}
4551		if (i == 3) {
4552			alu.last = 1;
4553		}
4554		r = r600_bytecode_add_alu(ctx->bc, &alu);
4555		if (r)
4556			return r;
4557	}
4558
4559	/* kill must be last in ALU */
4560	ctx->bc->force_add_cf = 1;
4561	ctx->shader->uses_kill = TRUE;
4562	return 0;
4563}
4564
/* LIT: classic fixed-function lighting coefficients.
 *
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = specular term, computed here as
 *           exp2(MUL_LIT(log2(max(src.y, 0)), src.w, src.x))
 *           (MUL_LIT presumably handles the src.x <= 0 cutoff and the
 *            exponent clamping -- confirm against the ISA docs)
 *   dst.w = 1.0
 *
 * On Cayman the scalar LOG/EXP ops are replicated over three vector
 * slots with only the target channel's write enabled.
 */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* the specular term is only needed when dst.z is written */
	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed (both paths above
		 * leave alu.dst pointing at it) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
4714
4715static int tgsi_rsq(struct r600_shader_ctx *ctx)
4716{
4717	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4718	struct r600_bytecode_alu alu;
4719	int i, r;
4720
4721	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4722
4723	/* XXX:
4724	 * For state trackers other than OpenGL, we'll want to use
4725	 * _RECIPSQRT_IEEE instead.
4726	 */
4727	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
4728
4729	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4730		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4731		r600_bytecode_src_set_abs(&alu.src[i]);
4732	}
4733	alu.dst.sel = ctx->temp_reg;
4734	alu.dst.write = 1;
4735	alu.last = 1;
4736	r = r600_bytecode_add_alu(ctx->bc, &alu);
4737	if (r)
4738		return r;
4739	/* replicate result */
4740	return tgsi_helper_tempx_replicate(ctx);
4741}
4742
4743static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
4744{
4745	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4746	struct r600_bytecode_alu alu;
4747	int i, r;
4748
4749	for (i = 0; i < 4; i++) {
4750		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4751		alu.src[0].sel = ctx->temp_reg;
4752		alu.op = ALU_OP1_MOV;
4753		alu.dst.chan = i;
4754		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4755		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4756		if (i == 3)
4757			alu.last = 1;
4758		r = r600_bytecode_add_alu(ctx->bc, &alu);
4759		if (r)
4760			return r;
4761	}
4762	return 0;
4763}
4764
4765static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
4766{
4767	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4768	struct r600_bytecode_alu alu;
4769	int i, r;
4770
4771	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4772	alu.op = ctx->inst_info->op;
4773	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4774		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4775	}
4776	alu.dst.sel = ctx->temp_reg;
4777	alu.dst.write = 1;
4778	alu.last = 1;
4779	r = r600_bytecode_add_alu(ctx->bc, &alu);
4780	if (r)
4781		return r;
4782	/* replicate result */
4783	return tgsi_helper_tempx_replicate(ctx);
4784}
4785
4786static int cayman_pow(struct r600_shader_ctx *ctx)
4787{
4788	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4789	int i, r;
4790	struct r600_bytecode_alu alu;
4791	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4792
4793	for (i = 0; i < 3; i++) {
4794		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4795		alu.op = ALU_OP1_LOG_IEEE;
4796		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4797		alu.dst.sel = ctx->temp_reg;
4798		alu.dst.chan = i;
4799		alu.dst.write = 1;
4800		if (i == 2)
4801			alu.last = 1;
4802		r = r600_bytecode_add_alu(ctx->bc, &alu);
4803		if (r)
4804			return r;
4805	}
4806
4807	/* b * LOG2(a) */
4808	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4809	alu.op = ALU_OP2_MUL;
4810	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4811	alu.src[1].sel = ctx->temp_reg;
4812	alu.dst.sel = ctx->temp_reg;
4813	alu.dst.write = 1;
4814	alu.last = 1;
4815	r = r600_bytecode_add_alu(ctx->bc, &alu);
4816	if (r)
4817		return r;
4818
4819	for (i = 0; i < last_slot; i++) {
4820		/* POW(a,b) = EXP2(b * LOG2(a))*/
4821		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4822		alu.op = ALU_OP1_EXP_IEEE;
4823		alu.src[0].sel = ctx->temp_reg;
4824
4825		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4826		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4827		if (i == last_slot - 1)
4828			alu.last = 1;
4829		r = r600_bytecode_add_alu(ctx->bc, &alu);
4830		if (r)
4831			return r;
4832	}
4833	return 0;
4834}
4835
4836static int tgsi_pow(struct r600_shader_ctx *ctx)
4837{
4838	struct r600_bytecode_alu alu;
4839	int r;
4840
4841	/* LOG2(a) */
4842	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4843	alu.op = ALU_OP1_LOG_IEEE;
4844	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4845	alu.dst.sel = ctx->temp_reg;
4846	alu.dst.write = 1;
4847	alu.last = 1;
4848	r = r600_bytecode_add_alu(ctx->bc, &alu);
4849	if (r)
4850		return r;
4851	/* b * LOG2(a) */
4852	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4853	alu.op = ALU_OP2_MUL;
4854	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4855	alu.src[1].sel = ctx->temp_reg;
4856	alu.dst.sel = ctx->temp_reg;
4857	alu.dst.write = 1;
4858	alu.last = 1;
4859	r = r600_bytecode_add_alu(ctx->bc, &alu);
4860	if (r)
4861		return r;
4862	/* POW(a,b) = EXP2(b * LOG2(a))*/
4863	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4864	alu.op = ALU_OP1_EXP_IEEE;
4865	alu.src[0].sel = ctx->temp_reg;
4866	alu.dst.sel = ctx->temp_reg;
4867	alu.dst.write = 1;
4868	alu.last = 1;
4869	r = r600_bytecode_add_alu(ctx->bc, &alu);
4870	if (r)
4871		return r;
4872	return tgsi_helper_tempx_replicate(ctx);
4873}
4874
4875static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
4876{
4877	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4878	struct r600_bytecode_alu alu;
4879	int i, r, j;
4880	unsigned write_mask = inst->Dst[0].Register.WriteMask;
4881	int tmp0 = ctx->temp_reg;
4882	int tmp1 = r600_get_temp(ctx);
4883	int tmp2 = r600_get_temp(ctx);
4884	int tmp3 = r600_get_temp(ctx);
4885	/* Unsigned path:
4886	 *
4887	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
4888	 *
4889	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
4890	 * 2. tmp0.z = lo (tmp0.x * src2)
4891	 * 3. tmp0.w = -tmp0.z
4892	 * 4. tmp0.y = hi (tmp0.x * src2)
4893	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
4894	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
4895	 * 7. tmp1.x = tmp0.x - tmp0.w
4896	 * 8. tmp1.y = tmp0.x + tmp0.w
4897	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
4898	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
4899	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
4900	 *
4901	 * 12. tmp0.w = src1 - tmp0.y       = r
4902	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
4903	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
4904	 *
4905	 * if DIV
4906	 *
4907	 *   15. tmp1.z = tmp0.z + 1			= q + 1
4908	 *   16. tmp1.w = tmp0.z - 1			= q - 1
4909	 *
4910	 * else MOD
4911	 *
4912	 *   15. tmp1.z = tmp0.w - src2			= r - src2
4913	 *   16. tmp1.w = tmp0.w + src2			= r + src2
4914	 *
4915	 * endif
4916	 *
4917	 * 17. tmp1.x = tmp1.x & tmp1.y
4918	 *
4919	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
4920	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
4921	 *
4922	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
4923	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
4924	 *
4925	 * Signed path:
4926	 *
4927	 * Same as unsigned, using abs values of the operands,
4928	 * and fixing the sign of the result in the end.
4929	 */
4930
4931	for (i = 0; i < 4; i++) {
4932		if (!(write_mask & (1<<i)))
4933			continue;
4934
4935		if (signed_op) {
4936
4937			/* tmp2.x = -src0 */
4938			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4939			alu.op = ALU_OP2_SUB_INT;
4940
4941			alu.dst.sel = tmp2;
4942			alu.dst.chan = 0;
4943			alu.dst.write = 1;
4944
4945			alu.src[0].sel = V_SQ_ALU_SRC_0;
4946
4947			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4948
4949			alu.last = 1;
4950			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4951				return r;
4952
4953			/* tmp2.y = -src1 */
4954			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4955			alu.op = ALU_OP2_SUB_INT;
4956
4957			alu.dst.sel = tmp2;
4958			alu.dst.chan = 1;
4959			alu.dst.write = 1;
4960
4961			alu.src[0].sel = V_SQ_ALU_SRC_0;
4962
4963			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4964
4965			alu.last = 1;
4966			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4967				return r;
4968
4969			/* tmp2.z sign bit is set if src0 and src2 signs are different */
4970			/* it will be a sign of the quotient */
4971			if (!mod) {
4972
4973				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4974				alu.op = ALU_OP2_XOR_INT;
4975
4976				alu.dst.sel = tmp2;
4977				alu.dst.chan = 2;
4978				alu.dst.write = 1;
4979
4980				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4981				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4982
4983				alu.last = 1;
4984				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4985					return r;
4986			}
4987
4988			/* tmp2.x = |src0| */
4989			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4990			alu.op = ALU_OP3_CNDGE_INT;
4991			alu.is_op3 = 1;
4992
4993			alu.dst.sel = tmp2;
4994			alu.dst.chan = 0;
4995			alu.dst.write = 1;
4996
4997			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4998			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4999			alu.src[2].sel = tmp2;
5000			alu.src[2].chan = 0;
5001
5002			alu.last = 1;
5003			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5004				return r;
5005
5006			/* tmp2.y = |src1| */
5007			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5008			alu.op = ALU_OP3_CNDGE_INT;
5009			alu.is_op3 = 1;
5010
5011			alu.dst.sel = tmp2;
5012			alu.dst.chan = 1;
5013			alu.dst.write = 1;
5014
5015			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5016			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5017			alu.src[2].sel = tmp2;
5018			alu.src[2].chan = 1;
5019
5020			alu.last = 1;
5021			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5022				return r;
5023
5024		}
5025
5026		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
5027		if (ctx->bc->chip_class == CAYMAN) {
5028			/* tmp3.x = u2f(src2) */
5029			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5030			alu.op = ALU_OP1_UINT_TO_FLT;
5031
5032			alu.dst.sel = tmp3;
5033			alu.dst.chan = 0;
5034			alu.dst.write = 1;
5035
5036			if (signed_op) {
5037				alu.src[0].sel = tmp2;
5038				alu.src[0].chan = 1;
5039			} else {
5040				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5041			}
5042
5043			alu.last = 1;
5044			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5045				return r;
5046
5047			/* tmp0.x = recip(tmp3.x) */
5048			for (j = 0 ; j < 3; j++) {
5049				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5050				alu.op = ALU_OP1_RECIP_IEEE;
5051
5052				alu.dst.sel = tmp0;
5053				alu.dst.chan = j;
5054				alu.dst.write = (j == 0);
5055
5056				alu.src[0].sel = tmp3;
5057				alu.src[0].chan = 0;
5058
5059				if (j == 2)
5060					alu.last = 1;
5061				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5062					return r;
5063			}
5064
5065			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5066			alu.op = ALU_OP2_MUL;
5067
5068			alu.src[0].sel = tmp0;
5069			alu.src[0].chan = 0;
5070
5071			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5072			alu.src[1].value = 0x4f800000;
5073
5074			alu.dst.sel = tmp3;
5075			alu.dst.write = 1;
5076			alu.last = 1;
5077			r = r600_bytecode_add_alu(ctx->bc, &alu);
5078			if (r)
5079				return r;
5080
5081			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5082			alu.op = ALU_OP1_FLT_TO_UINT;
5083
5084			alu.dst.sel = tmp0;
5085			alu.dst.chan = 0;
5086			alu.dst.write = 1;
5087
5088			alu.src[0].sel = tmp3;
5089			alu.src[0].chan = 0;
5090
5091			alu.last = 1;
5092			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5093				return r;
5094
5095		} else {
5096			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5097			alu.op = ALU_OP1_RECIP_UINT;
5098
5099			alu.dst.sel = tmp0;
5100			alu.dst.chan = 0;
5101			alu.dst.write = 1;
5102
5103			if (signed_op) {
5104				alu.src[0].sel = tmp2;
5105				alu.src[0].chan = 1;
5106			} else {
5107				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5108			}
5109
5110			alu.last = 1;
5111			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5112				return r;
5113		}
5114
5115		/* 2. tmp0.z = lo (tmp0.x * src2) */
5116		if (ctx->bc->chip_class == CAYMAN) {
5117			for (j = 0 ; j < 4; j++) {
5118				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5119				alu.op = ALU_OP2_MULLO_UINT;
5120
5121				alu.dst.sel = tmp0;
5122				alu.dst.chan = j;
5123				alu.dst.write = (j == 2);
5124
5125				alu.src[0].sel = tmp0;
5126				alu.src[0].chan = 0;
5127				if (signed_op) {
5128					alu.src[1].sel = tmp2;
5129					alu.src[1].chan = 1;
5130				} else {
5131					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5132				}
5133
5134				alu.last = (j == 3);
5135				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5136					return r;
5137			}
5138		} else {
5139			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5140			alu.op = ALU_OP2_MULLO_UINT;
5141
5142			alu.dst.sel = tmp0;
5143			alu.dst.chan = 2;
5144			alu.dst.write = 1;
5145
5146			alu.src[0].sel = tmp0;
5147			alu.src[0].chan = 0;
5148			if (signed_op) {
5149				alu.src[1].sel = tmp2;
5150				alu.src[1].chan = 1;
5151			} else {
5152				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5153			}
5154
5155			alu.last = 1;
5156			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5157				return r;
5158		}
5159
5160		/* 3. tmp0.w = -tmp0.z */
5161		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5162		alu.op = ALU_OP2_SUB_INT;
5163
5164		alu.dst.sel = tmp0;
5165		alu.dst.chan = 3;
5166		alu.dst.write = 1;
5167
5168		alu.src[0].sel = V_SQ_ALU_SRC_0;
5169		alu.src[1].sel = tmp0;
5170		alu.src[1].chan = 2;
5171
5172		alu.last = 1;
5173		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5174			return r;
5175
5176		/* 4. tmp0.y = hi (tmp0.x * src2) */
5177		if (ctx->bc->chip_class == CAYMAN) {
5178			for (j = 0 ; j < 4; j++) {
5179				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5180				alu.op = ALU_OP2_MULHI_UINT;
5181
5182				alu.dst.sel = tmp0;
5183				alu.dst.chan = j;
5184				alu.dst.write = (j == 1);
5185
5186				alu.src[0].sel = tmp0;
5187				alu.src[0].chan = 0;
5188
5189				if (signed_op) {
5190					alu.src[1].sel = tmp2;
5191					alu.src[1].chan = 1;
5192				} else {
5193					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5194				}
5195				alu.last = (j == 3);
5196				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5197					return r;
5198			}
5199		} else {
5200			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5201			alu.op = ALU_OP2_MULHI_UINT;
5202
5203			alu.dst.sel = tmp0;
5204			alu.dst.chan = 1;
5205			alu.dst.write = 1;
5206
5207			alu.src[0].sel = tmp0;
5208			alu.src[0].chan = 0;
5209
5210			if (signed_op) {
5211				alu.src[1].sel = tmp2;
5212				alu.src[1].chan = 1;
5213			} else {
5214				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5215			}
5216
5217			alu.last = 1;
5218			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5219				return r;
5220		}
5221
5222		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
5223		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5224		alu.op = ALU_OP3_CNDE_INT;
5225		alu.is_op3 = 1;
5226
5227		alu.dst.sel = tmp0;
5228		alu.dst.chan = 2;
5229		alu.dst.write = 1;
5230
5231		alu.src[0].sel = tmp0;
5232		alu.src[0].chan = 1;
5233		alu.src[1].sel = tmp0;
5234		alu.src[1].chan = 3;
5235		alu.src[2].sel = tmp0;
5236		alu.src[2].chan = 2;
5237
5238		alu.last = 1;
5239		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5240			return r;
5241
5242		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
5243		if (ctx->bc->chip_class == CAYMAN) {
5244			for (j = 0 ; j < 4; j++) {
5245				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5246				alu.op = ALU_OP2_MULHI_UINT;
5247
5248				alu.dst.sel = tmp0;
5249				alu.dst.chan = j;
5250				alu.dst.write = (j == 3);
5251
5252				alu.src[0].sel = tmp0;
5253				alu.src[0].chan = 2;
5254
5255				alu.src[1].sel = tmp0;
5256				alu.src[1].chan = 0;
5257
5258				alu.last = (j == 3);
5259				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5260					return r;
5261			}
5262		} else {
5263			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5264			alu.op = ALU_OP2_MULHI_UINT;
5265
5266			alu.dst.sel = tmp0;
5267			alu.dst.chan = 3;
5268			alu.dst.write = 1;
5269
5270			alu.src[0].sel = tmp0;
5271			alu.src[0].chan = 2;
5272
5273			alu.src[1].sel = tmp0;
5274			alu.src[1].chan = 0;
5275
5276			alu.last = 1;
5277			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5278				return r;
5279		}
5280
5281		/* 7. tmp1.x = tmp0.x - tmp0.w */
5282		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5283		alu.op = ALU_OP2_SUB_INT;
5284
5285		alu.dst.sel = tmp1;
5286		alu.dst.chan = 0;
5287		alu.dst.write = 1;
5288
5289		alu.src[0].sel = tmp0;
5290		alu.src[0].chan = 0;
5291		alu.src[1].sel = tmp0;
5292		alu.src[1].chan = 3;
5293
5294		alu.last = 1;
5295		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5296			return r;
5297
5298		/* 8. tmp1.y = tmp0.x + tmp0.w */
5299		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5300		alu.op = ALU_OP2_ADD_INT;
5301
5302		alu.dst.sel = tmp1;
5303		alu.dst.chan = 1;
5304		alu.dst.write = 1;
5305
5306		alu.src[0].sel = tmp0;
5307		alu.src[0].chan = 0;
5308		alu.src[1].sel = tmp0;
5309		alu.src[1].chan = 3;
5310
5311		alu.last = 1;
5312		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5313			return r;
5314
5315		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
5316		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5317		alu.op = ALU_OP3_CNDE_INT;
5318		alu.is_op3 = 1;
5319
5320		alu.dst.sel = tmp0;
5321		alu.dst.chan = 0;
5322		alu.dst.write = 1;
5323
5324		alu.src[0].sel = tmp0;
5325		alu.src[0].chan = 1;
5326		alu.src[1].sel = tmp1;
5327		alu.src[1].chan = 1;
5328		alu.src[2].sel = tmp1;
5329		alu.src[2].chan = 0;
5330
5331		alu.last = 1;
5332		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5333			return r;
5334
5335		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
5336		if (ctx->bc->chip_class == CAYMAN) {
5337			for (j = 0 ; j < 4; j++) {
5338				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5339				alu.op = ALU_OP2_MULHI_UINT;
5340
5341				alu.dst.sel = tmp0;
5342				alu.dst.chan = j;
5343				alu.dst.write = (j == 2);
5344
5345				alu.src[0].sel = tmp0;
5346				alu.src[0].chan = 0;
5347
5348				if (signed_op) {
5349					alu.src[1].sel = tmp2;
5350					alu.src[1].chan = 0;
5351				} else {
5352					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5353				}
5354
5355				alu.last = (j == 3);
5356				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5357					return r;
5358			}
5359		} else {
5360			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5361			alu.op = ALU_OP2_MULHI_UINT;
5362
5363			alu.dst.sel = tmp0;
5364			alu.dst.chan = 2;
5365			alu.dst.write = 1;
5366
5367			alu.src[0].sel = tmp0;
5368			alu.src[0].chan = 0;
5369
5370			if (signed_op) {
5371				alu.src[1].sel = tmp2;
5372				alu.src[1].chan = 0;
5373			} else {
5374				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5375			}
5376
5377			alu.last = 1;
5378			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5379				return r;
5380		}
5381
5382		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
5383		if (ctx->bc->chip_class == CAYMAN) {
5384			for (j = 0 ; j < 4; j++) {
5385				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5386				alu.op = ALU_OP2_MULLO_UINT;
5387
5388				alu.dst.sel = tmp0;
5389				alu.dst.chan = j;
5390				alu.dst.write = (j == 1);
5391
5392				if (signed_op) {
5393					alu.src[0].sel = tmp2;
5394					alu.src[0].chan = 1;
5395				} else {
5396					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5397				}
5398
5399				alu.src[1].sel = tmp0;
5400				alu.src[1].chan = 2;
5401
5402				alu.last = (j == 3);
5403				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5404					return r;
5405			}
5406		} else {
5407			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5408			alu.op = ALU_OP2_MULLO_UINT;
5409
5410			alu.dst.sel = tmp0;
5411			alu.dst.chan = 1;
5412			alu.dst.write = 1;
5413
5414			if (signed_op) {
5415				alu.src[0].sel = tmp2;
5416				alu.src[0].chan = 1;
5417			} else {
5418				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5419			}
5420
5421			alu.src[1].sel = tmp0;
5422			alu.src[1].chan = 2;
5423
5424			alu.last = 1;
5425			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5426				return r;
5427		}
5428
5429		/* 12. tmp0.w = src1 - tmp0.y       = r */
5430		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5431		alu.op = ALU_OP2_SUB_INT;
5432
5433		alu.dst.sel = tmp0;
5434		alu.dst.chan = 3;
5435		alu.dst.write = 1;
5436
5437		if (signed_op) {
5438			alu.src[0].sel = tmp2;
5439			alu.src[0].chan = 0;
5440		} else {
5441			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5442		}
5443
5444		alu.src[1].sel = tmp0;
5445		alu.src[1].chan = 1;
5446
5447		alu.last = 1;
5448		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5449			return r;
5450
5451		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
5452		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5453		alu.op = ALU_OP2_SETGE_UINT;
5454
5455		alu.dst.sel = tmp1;
5456		alu.dst.chan = 0;
5457		alu.dst.write = 1;
5458
5459		alu.src[0].sel = tmp0;
5460		alu.src[0].chan = 3;
5461		if (signed_op) {
5462			alu.src[1].sel = tmp2;
5463			alu.src[1].chan = 1;
5464		} else {
5465			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5466		}
5467
5468		alu.last = 1;
5469		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5470			return r;
5471
5472		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
5473		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5474		alu.op = ALU_OP2_SETGE_UINT;
5475
5476		alu.dst.sel = tmp1;
5477		alu.dst.chan = 1;
5478		alu.dst.write = 1;
5479
5480		if (signed_op) {
5481			alu.src[0].sel = tmp2;
5482			alu.src[0].chan = 0;
5483		} else {
5484			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5485		}
5486
5487		alu.src[1].sel = tmp0;
5488		alu.src[1].chan = 1;
5489
5490		alu.last = 1;
5491		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5492			return r;
5493
5494		if (mod) { /* UMOD */
5495
5496			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
5497			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5498			alu.op = ALU_OP2_SUB_INT;
5499
5500			alu.dst.sel = tmp1;
5501			alu.dst.chan = 2;
5502			alu.dst.write = 1;
5503
5504			alu.src[0].sel = tmp0;
5505			alu.src[0].chan = 3;
5506
5507			if (signed_op) {
5508				alu.src[1].sel = tmp2;
5509				alu.src[1].chan = 1;
5510			} else {
5511				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5512			}
5513
5514			alu.last = 1;
5515			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5516				return r;
5517
5518			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
5519			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5520			alu.op = ALU_OP2_ADD_INT;
5521
5522			alu.dst.sel = tmp1;
5523			alu.dst.chan = 3;
5524			alu.dst.write = 1;
5525
5526			alu.src[0].sel = tmp0;
5527			alu.src[0].chan = 3;
5528			if (signed_op) {
5529				alu.src[1].sel = tmp2;
5530				alu.src[1].chan = 1;
5531			} else {
5532				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5533			}
5534
5535			alu.last = 1;
5536			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5537				return r;
5538
5539		} else { /* UDIV */
5540
5541			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
5542			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5543			alu.op = ALU_OP2_ADD_INT;
5544
5545			alu.dst.sel = tmp1;
5546			alu.dst.chan = 2;
5547			alu.dst.write = 1;
5548
5549			alu.src[0].sel = tmp0;
5550			alu.src[0].chan = 2;
5551			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5552
5553			alu.last = 1;
5554			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5555				return r;
5556
5557			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
5558			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5559			alu.op = ALU_OP2_ADD_INT;
5560
5561			alu.dst.sel = tmp1;
5562			alu.dst.chan = 3;
5563			alu.dst.write = 1;
5564
5565			alu.src[0].sel = tmp0;
5566			alu.src[0].chan = 2;
5567			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
5568
5569			alu.last = 1;
5570			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5571				return r;
5572
5573		}
5574
5575		/* 17. tmp1.x = tmp1.x & tmp1.y */
5576		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5577		alu.op = ALU_OP2_AND_INT;
5578
5579		alu.dst.sel = tmp1;
5580		alu.dst.chan = 0;
5581		alu.dst.write = 1;
5582
5583		alu.src[0].sel = tmp1;
5584		alu.src[0].chan = 0;
5585		alu.src[1].sel = tmp1;
5586		alu.src[1].chan = 1;
5587
5588		alu.last = 1;
5589		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5590			return r;
5591
5592		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
5593		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
5594		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5595		alu.op = ALU_OP3_CNDE_INT;
5596		alu.is_op3 = 1;
5597
5598		alu.dst.sel = tmp0;
5599		alu.dst.chan = 2;
5600		alu.dst.write = 1;
5601
5602		alu.src[0].sel = tmp1;
5603		alu.src[0].chan = 0;
5604		alu.src[1].sel = tmp0;
5605		alu.src[1].chan = mod ? 3 : 2;
5606		alu.src[2].sel = tmp1;
5607		alu.src[2].chan = 2;
5608
5609		alu.last = 1;
5610		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5611			return r;
5612
5613		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
5614		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5615		alu.op = ALU_OP3_CNDE_INT;
5616		alu.is_op3 = 1;
5617
5618		if (signed_op) {
5619			alu.dst.sel = tmp0;
5620			alu.dst.chan = 2;
5621			alu.dst.write = 1;
5622		} else {
5623			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5624		}
5625
5626		alu.src[0].sel = tmp1;
5627		alu.src[0].chan = 1;
5628		alu.src[1].sel = tmp1;
5629		alu.src[1].chan = 3;
5630		alu.src[2].sel = tmp0;
5631		alu.src[2].chan = 2;
5632
5633		alu.last = 1;
5634		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5635			return r;
5636
5637		if (signed_op) {
5638
5639			/* fix the sign of the result */
5640
5641			if (mod) {
5642
5643				/* tmp0.x = -tmp0.z */
5644				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5645				alu.op = ALU_OP2_SUB_INT;
5646
5647				alu.dst.sel = tmp0;
5648				alu.dst.chan = 0;
5649				alu.dst.write = 1;
5650
5651				alu.src[0].sel = V_SQ_ALU_SRC_0;
5652				alu.src[1].sel = tmp0;
5653				alu.src[1].chan = 2;
5654
5655				alu.last = 1;
5656				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5657					return r;
5658
5659				/* sign of the remainder is the same as the sign of src0 */
5660				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
5661				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5662				alu.op = ALU_OP3_CNDGE_INT;
5663				alu.is_op3 = 1;
5664
5665				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5666
5667				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5668				alu.src[1].sel = tmp0;
5669				alu.src[1].chan = 2;
5670				alu.src[2].sel = tmp0;
5671				alu.src[2].chan = 0;
5672
5673				alu.last = 1;
5674				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5675					return r;
5676
5677			} else {
5678
5679				/* tmp0.x = -tmp0.z */
5680				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5681				alu.op = ALU_OP2_SUB_INT;
5682
5683				alu.dst.sel = tmp0;
5684				alu.dst.chan = 0;
5685				alu.dst.write = 1;
5686
5687				alu.src[0].sel = V_SQ_ALU_SRC_0;
5688				alu.src[1].sel = tmp0;
5689				alu.src[1].chan = 2;
5690
5691				alu.last = 1;
5692				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5693					return r;
5694
5695				/* fix the quotient sign (same as the sign of src0*src1) */
5696				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
5697				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5698				alu.op = ALU_OP3_CNDGE_INT;
5699				alu.is_op3 = 1;
5700
5701				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5702
5703				alu.src[0].sel = tmp2;
5704				alu.src[0].chan = 2;
5705				alu.src[1].sel = tmp0;
5706				alu.src[1].chan = 2;
5707				alu.src[2].sel = tmp0;
5708				alu.src[2].chan = 0;
5709
5710				alu.last = 1;
5711				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5712					return r;
5713			}
5714		}
5715	}
5716	return 0;
5717}
5718
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	/* TGSI UDIV: unsigned quotient — mod=0 (divide), signed_op=0 (unsigned). */
	return tgsi_divmod(ctx, 0, 0);
}
5723
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	/* TGSI UMOD: unsigned remainder — mod=1 (modulo), signed_op=0 (unsigned). */
	return tgsi_divmod(ctx, 1, 0);
}
5728
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	/* TGSI IDIV: signed quotient — mod=0 (divide), signed_op=1 (signed). */
	return tgsi_divmod(ctx, 0, 1);
}
5733
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	/* TGSI IMOD: signed remainder — mod=1 (modulo), signed_op=1 (signed). */
	return tgsi_divmod(ctx, 1, 1);
}
5738
5739
5740static int tgsi_f2i(struct r600_shader_ctx *ctx)
5741{
5742	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5743	struct r600_bytecode_alu alu;
5744	int i, r;
5745	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5746	int last_inst = tgsi_last_instruction(write_mask);
5747
5748	for (i = 0; i < 4; i++) {
5749		if (!(write_mask & (1<<i)))
5750			continue;
5751
5752		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5753		alu.op = ALU_OP1_TRUNC;
5754
5755		alu.dst.sel = ctx->temp_reg;
5756		alu.dst.chan = i;
5757		alu.dst.write = 1;
5758
5759		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5760		if (i == last_inst)
5761			alu.last = 1;
5762		r = r600_bytecode_add_alu(ctx->bc, &alu);
5763		if (r)
5764			return r;
5765	}
5766
5767	for (i = 0; i < 4; i++) {
5768		if (!(write_mask & (1<<i)))
5769			continue;
5770
5771		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5772		alu.op = ctx->inst_info->op;
5773
5774		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5775
5776		alu.src[0].sel = ctx->temp_reg;
5777		alu.src[0].chan = i;
5778
5779		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
5780			alu.last = 1;
5781		r = r600_bytecode_add_alu(ctx->bc, &alu);
5782		if (r)
5783			return r;
5784	}
5785
5786	return 0;
5787}
5788
5789static int tgsi_iabs(struct r600_shader_ctx *ctx)
5790{
5791	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5792	struct r600_bytecode_alu alu;
5793	int i, r;
5794	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5795	int last_inst = tgsi_last_instruction(write_mask);
5796
5797	/* tmp = -src */
5798	for (i = 0; i < 4; i++) {
5799		if (!(write_mask & (1<<i)))
5800			continue;
5801
5802		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5803		alu.op = ALU_OP2_SUB_INT;
5804
5805		alu.dst.sel = ctx->temp_reg;
5806		alu.dst.chan = i;
5807		alu.dst.write = 1;
5808
5809		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5810		alu.src[0].sel = V_SQ_ALU_SRC_0;
5811
5812		if (i == last_inst)
5813			alu.last = 1;
5814		r = r600_bytecode_add_alu(ctx->bc, &alu);
5815		if (r)
5816			return r;
5817	}
5818
5819	/* dst = (src >= 0 ? src : tmp) */
5820	for (i = 0; i < 4; i++) {
5821		if (!(write_mask & (1<<i)))
5822			continue;
5823
5824		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5825		alu.op = ALU_OP3_CNDGE_INT;
5826		alu.is_op3 = 1;
5827		alu.dst.write = 1;
5828
5829		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5830
5831		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5832		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5833		alu.src[2].sel = ctx->temp_reg;
5834		alu.src[2].chan = i;
5835
5836		if (i == last_inst)
5837			alu.last = 1;
5838		r = r600_bytecode_add_alu(ctx->bc, &alu);
5839		if (r)
5840			return r;
5841	}
5842	return 0;
5843}
5844
5845static int tgsi_issg(struct r600_shader_ctx *ctx)
5846{
5847	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5848	struct r600_bytecode_alu alu;
5849	int i, r;
5850	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5851	int last_inst = tgsi_last_instruction(write_mask);
5852
5853	/* tmp = (src >= 0 ? src : -1) */
5854	for (i = 0; i < 4; i++) {
5855		if (!(write_mask & (1<<i)))
5856			continue;
5857
5858		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5859		alu.op = ALU_OP3_CNDGE_INT;
5860		alu.is_op3 = 1;
5861
5862		alu.dst.sel = ctx->temp_reg;
5863		alu.dst.chan = i;
5864		alu.dst.write = 1;
5865
5866		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5867		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5868		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
5869
5870		if (i == last_inst)
5871			alu.last = 1;
5872		r = r600_bytecode_add_alu(ctx->bc, &alu);
5873		if (r)
5874			return r;
5875	}
5876
5877	/* dst = (tmp > 0 ? 1 : tmp) */
5878	for (i = 0; i < 4; i++) {
5879		if (!(write_mask & (1<<i)))
5880			continue;
5881
5882		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5883		alu.op = ALU_OP3_CNDGT_INT;
5884		alu.is_op3 = 1;
5885		alu.dst.write = 1;
5886
5887		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5888
5889		alu.src[0].sel = ctx->temp_reg;
5890		alu.src[0].chan = i;
5891
5892		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5893
5894		alu.src[2].sel = ctx->temp_reg;
5895		alu.src[2].chan = i;
5896
5897		if (i == last_inst)
5898			alu.last = 1;
5899		r = r600_bytecode_add_alu(ctx->bc, &alu);
5900		if (r)
5901			return r;
5902	}
5903	return 0;
5904}
5905
5906
5907
5908static int tgsi_ssg(struct r600_shader_ctx *ctx)
5909{
5910	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5911	struct r600_bytecode_alu alu;
5912	int i, r;
5913
5914	/* tmp = (src > 0 ? 1 : src) */
5915	for (i = 0; i < 4; i++) {
5916		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5917		alu.op = ALU_OP3_CNDGT;
5918		alu.is_op3 = 1;
5919
5920		alu.dst.sel = ctx->temp_reg;
5921		alu.dst.chan = i;
5922
5923		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5924		alu.src[1].sel = V_SQ_ALU_SRC_1;
5925		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5926
5927		if (i == 3)
5928			alu.last = 1;
5929		r = r600_bytecode_add_alu(ctx->bc, &alu);
5930		if (r)
5931			return r;
5932	}
5933
5934	/* dst = (-tmp > 0 ? -1 : tmp) */
5935	for (i = 0; i < 4; i++) {
5936		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5937		alu.op = ALU_OP3_CNDGT;
5938		alu.is_op3 = 1;
5939		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5940
5941		alu.src[0].sel = ctx->temp_reg;
5942		alu.src[0].chan = i;
5943		alu.src[0].neg = 1;
5944
5945		alu.src[1].sel = V_SQ_ALU_SRC_1;
5946		alu.src[1].neg = 1;
5947
5948		alu.src[2].sel = ctx->temp_reg;
5949		alu.src[2].chan = i;
5950
5951		if (i == 3)
5952			alu.last = 1;
5953		r = r600_bytecode_add_alu(ctx->bc, &alu);
5954		if (r)
5955			return r;
5956	}
5957	return 0;
5958}
5959
5960static int tgsi_bfi(struct r600_shader_ctx *ctx)
5961{
5962	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5963	struct r600_bytecode_alu alu;
5964	int i, r, t1, t2;
5965
5966	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5967	int last_inst = tgsi_last_instruction(write_mask);
5968
5969	t1 = ctx->temp_reg;
5970
5971	for (i = 0; i < 4; i++) {
5972		if (!(write_mask & (1<<i)))
5973			continue;
5974
5975		/* create mask tmp */
5976		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5977		alu.op = ALU_OP2_BFM_INT;
5978		alu.dst.sel = t1;
5979		alu.dst.chan = i;
5980		alu.dst.write = 1;
5981		alu.last = i == last_inst;
5982
5983		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
5984		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5985
5986		r = r600_bytecode_add_alu(ctx->bc, &alu);
5987		if (r)
5988			return r;
5989	}
5990
5991	t2 = r600_get_temp(ctx);
5992
5993	for (i = 0; i < 4; i++) {
5994		if (!(write_mask & (1<<i)))
5995			continue;
5996
5997		/* shift insert left */
5998		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5999		alu.op = ALU_OP2_LSHL_INT;
6000		alu.dst.sel = t2;
6001		alu.dst.chan = i;
6002		alu.dst.write = 1;
6003		alu.last = i == last_inst;
6004
6005		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6006		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6007
6008		r = r600_bytecode_add_alu(ctx->bc, &alu);
6009		if (r)
6010			return r;
6011	}
6012
6013	for (i = 0; i < 4; i++) {
6014		if (!(write_mask & (1<<i)))
6015			continue;
6016
6017		/* actual bitfield insert */
6018		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6019		alu.op = ALU_OP3_BFI_INT;
6020		alu.is_op3 = 1;
6021		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6022		alu.dst.chan = i;
6023		alu.dst.write = 1;
6024		alu.last = i == last_inst;
6025
6026		alu.src[0].sel = t1;
6027		alu.src[0].chan = i;
6028		alu.src[1].sel = t2;
6029		alu.src[1].chan = i;
6030		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6031
6032		r = r600_bytecode_add_alu(ctx->bc, &alu);
6033		if (r)
6034			return r;
6035	}
6036
6037	return 0;
6038}
6039
6040static int tgsi_msb(struct r600_shader_ctx *ctx)
6041{
6042	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6043	struct r600_bytecode_alu alu;
6044	int i, r, t1, t2;
6045
6046	unsigned write_mask = inst->Dst[0].Register.WriteMask;
6047	int last_inst = tgsi_last_instruction(write_mask);
6048
6049	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6050		ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6051
6052	t1 = ctx->temp_reg;
6053
6054	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6055	for (i = 0; i < 4; i++) {
6056		if (!(write_mask & (1<<i)))
6057			continue;
6058
6059		/* t1 = FFBH_INT / FFBH_UINT */
6060		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6061		alu.op = ctx->inst_info->op;
6062		alu.dst.sel = t1;
6063		alu.dst.chan = i;
6064		alu.dst.write = 1;
6065		alu.last = i == last_inst;
6066
6067		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6068
6069		r = r600_bytecode_add_alu(ctx->bc, &alu);
6070		if (r)
6071			return r;
6072	}
6073
6074	t2 = r600_get_temp(ctx);
6075
6076	for (i = 0; i < 4; i++) {
6077		if (!(write_mask & (1<<i)))
6078			continue;
6079
6080		/* t2 = 31 - t1 */
6081		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6082		alu.op = ALU_OP2_SUB_INT;
6083		alu.dst.sel = t2;
6084		alu.dst.chan = i;
6085		alu.dst.write = 1;
6086		alu.last = i == last_inst;
6087
6088		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6089		alu.src[0].value = 31;
6090		alu.src[1].sel = t1;
6091		alu.src[1].chan = i;
6092
6093		r = r600_bytecode_add_alu(ctx->bc, &alu);
6094		if (r)
6095			return r;
6096	}
6097
6098	for (i = 0; i < 4; i++) {
6099		if (!(write_mask & (1<<i)))
6100			continue;
6101
6102		/* result = t1 >= 0 ? t2 : t1 */
6103		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6104		alu.op = ALU_OP3_CNDGE_INT;
6105		alu.is_op3 = 1;
6106		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6107		alu.dst.chan = i;
6108		alu.dst.write = 1;
6109		alu.last = i == last_inst;
6110
6111		alu.src[0].sel = t1;
6112		alu.src[0].chan = i;
6113		alu.src[1].sel = t2;
6114		alu.src[1].chan = i;
6115		alu.src[2].sel = t1;
6116		alu.src[2].chan = i;
6117
6118		r = r600_bytecode_add_alu(ctx->bc, &alu);
6119		if (r)
6120			return r;
6121	}
6122
6123	return 0;
6124}
6125
/* Lower TGSI_OPCODE_INTERP_{CENTROID,OFFSET,SAMPLE} to Evergreen/Cayman
 * bytecode.  For OFFSET/SAMPLE the barycentric i/j pair is first adjusted
 * using screen-space gradients, then the input is re-interpolated with the
 * INTERP_ZW/INTERP_XY ALU pair, and finally swizzle-copied to the dest. */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	/* The interpolated operand must be a shader input. */
	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	/* Find the hw interpolator matching this input's mode/location;
	 * fall back to interpolator 0 if none was allocated. */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* Two i/j pairs are packed per GPR: ij_index/2 selects the GPR,
	 * (ij_index%2)*2 selects which channel pair holds i/j. */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		/* For INTERP_SAMPLE the offset comes from the sample-position
		 * table indexed by src1. */
		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Fetch d(i,j)/dx and d(i,j)/dy of the barycentrics via the
		 * texture unit's gradient instructions. */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = gradientsH * offset.x + ij  (horizontal step) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				/* sample position x lives in chan 2 of the loaded GPR */
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy = gradientsV * offset.y + temp.xy  (vertical step) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				/* sample position y lives in chan 3 of the loaded GPR */
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Re-interpolate the attribute: eight slots, INTERP_ZW first (writes
	 * tmp.zw on i=2,3) then INTERP_XY (writes tmp.xy on i=4,5); the other
	 * slots only feed the instruction group.  Each slot reads j then i
	 * (1 - i%2) and the per-input parameter base. */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			/* use the adjusted i/j computed above */
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			/* use the raw hw interpolator i/j */
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		/* INTERP ops require this fixed operand/bank ordering */
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		/* apply the source swizzle with a MOV into the real destination */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6289
6290
6291static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6292{
6293	struct r600_bytecode_alu alu;
6294	int i, r;
6295
6296	for (i = 0; i < 4; i++) {
6297		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6298		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6299			alu.op = ALU_OP0_NOP;
6300			alu.dst.chan = i;
6301		} else {
6302			alu.op = ALU_OP1_MOV;
6303			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6304			alu.src[0].sel = ctx->temp_reg;
6305			alu.src[0].chan = i;
6306		}
6307		if (i == 3) {
6308			alu.last = 1;
6309		}
6310		r = r600_bytecode_add_alu(ctx->bc, &alu);
6311		if (r)
6312			return r;
6313	}
6314	return 0;
6315}
6316
6317static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6318                                 unsigned temp, int chan,
6319                                 struct r600_bytecode_alu_src *bc_src,
6320                                 const struct r600_shader_src *shader_src)
6321{
6322	struct r600_bytecode_alu alu;
6323	int r;
6324
6325	r600_bytecode_src(bc_src, shader_src, chan);
6326
6327	/* op3 operands don't support abs modifier */
6328	if (bc_src->abs) {
6329		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
6330		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6331		alu.op = ALU_OP1_MOV;
6332		alu.dst.sel = temp;
6333		alu.dst.chan = chan;
6334		alu.dst.write = 1;
6335
6336		alu.src[0] = *bc_src;
6337		alu.last = true; // sufficient?
6338		r = r600_bytecode_add_alu(ctx->bc, &alu);
6339		if (r)
6340			return r;
6341
6342		memset(bc_src, 0, sizeof(*bc_src));
6343		bc_src->sel = temp;
6344		bc_src->chan = chan;
6345	}
6346	return 0;
6347}
6348
6349static int tgsi_op3(struct r600_shader_ctx *ctx)
6350{
6351	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6352	struct r600_bytecode_alu alu;
6353	int i, j, r;
6354	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6355	int temp_regs[4];
6356
6357	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6358		temp_regs[j] = 0;
6359		if (ctx->src[j].abs)
6360			temp_regs[j] = r600_get_temp(ctx);
6361	}
6362	for (i = 0; i < lasti + 1; i++) {
6363		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6364			continue;
6365
6366		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6367		alu.op = ctx->inst_info->op;
6368		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6369			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6370			if (r)
6371				return r;
6372		}
6373
6374		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6375		alu.dst.chan = i;
6376		alu.dst.write = 1;
6377		alu.is_op3 = 1;
6378		if (i == lasti) {
6379			alu.last = 1;
6380		}
6381		r = r600_bytecode_add_alu(ctx->bc, &alu);
6382		if (r)
6383			return r;
6384	}
6385	return 0;
6386}
6387
6388static int tgsi_dp(struct r600_shader_ctx *ctx)
6389{
6390	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6391	struct r600_bytecode_alu alu;
6392	int i, j, r;
6393
6394	for (i = 0; i < 4; i++) {
6395		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6396		alu.op = ctx->inst_info->op;
6397		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6398			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6399		}
6400
6401		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6402		alu.dst.chan = i;
6403		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6404		/* handle some special cases */
6405		switch (inst->Instruction.Opcode) {
6406		case TGSI_OPCODE_DP2:
6407			if (i > 1) {
6408				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6409				alu.src[0].chan = alu.src[1].chan = 0;
6410			}
6411			break;
6412		case TGSI_OPCODE_DP3:
6413			if (i > 2) {
6414				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6415				alu.src[0].chan = alu.src[1].chan = 0;
6416			}
6417			break;
6418		case TGSI_OPCODE_DPH:
6419			if (i == 3) {
6420				alu.src[0].sel = V_SQ_ALU_SRC_1;
6421				alu.src[0].chan = 0;
6422				alu.src[0].neg = 0;
6423			}
6424			break;
6425		default:
6426			break;
6427		}
6428		if (i == 3) {
6429			alu.last = 1;
6430		}
6431		r = r600_bytecode_add_alu(ctx->bc, &alu);
6432		if (r)
6433			return r;
6434	}
6435	return 0;
6436}
6437
6438static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6439						    unsigned index)
6440{
6441	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6442	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6443		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6444		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6445		ctx->src[index].neg || ctx->src[index].abs ||
6446		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
6447}
6448
6449static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6450					unsigned index)
6451{
6452	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6453	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6454}
6455
6456static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
6457{
6458	struct r600_bytecode_vtx vtx;
6459	struct r600_bytecode_alu alu;
6460	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6461	int src_gpr, r, i;
6462	int id = tgsi_tex_get_src_gpr(ctx, 1);
6463
6464	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6465	if (src_requires_loading) {
6466		for (i = 0; i < 4; i++) {
6467			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6468			alu.op = ALU_OP1_MOV;
6469			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6470			alu.dst.sel = ctx->temp_reg;
6471			alu.dst.chan = i;
6472			if (i == 3)
6473				alu.last = 1;
6474			alu.dst.write = 1;
6475			r = r600_bytecode_add_alu(ctx->bc, &alu);
6476			if (r)
6477				return r;
6478		}
6479		src_gpr = ctx->temp_reg;
6480	}
6481
6482	memset(&vtx, 0, sizeof(vtx));
6483	vtx.op = FETCH_OP_VFETCH;
6484	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6485	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6486	vtx.src_gpr = src_gpr;
6487	vtx.mega_fetch_count = 16;
6488	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6489	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
6490	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
6491	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
6492	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
6493	vtx.use_const_fields = 1;
6494
6495	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
6496		return r;
6497
6498	if (ctx->bc->chip_class >= EVERGREEN)
6499		return 0;
6500
6501	for (i = 0; i < 4; i++) {
6502		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6503		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6504			continue;
6505
6506		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6507		alu.op = ALU_OP2_AND_INT;
6508
6509		alu.dst.chan = i;
6510		alu.dst.sel = vtx.dst_gpr;
6511		alu.dst.write = 1;
6512
6513		alu.src[0].sel = vtx.dst_gpr;
6514		alu.src[0].chan = i;
6515
6516		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
6517		alu.src[1].sel += (id * 2);
6518		alu.src[1].chan = i % 4;
6519		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6520
6521		if (i == lasti)
6522			alu.last = 1;
6523		r = r600_bytecode_add_alu(ctx->bc, &alu);
6524		if (r)
6525			return r;
6526	}
6527
6528	if (inst->Dst[0].Register.WriteMask & 3) {
6529		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6530		alu.op = ALU_OP2_OR_INT;
6531
6532		alu.dst.chan = 3;
6533		alu.dst.sel = vtx.dst_gpr;
6534		alu.dst.write = 1;
6535
6536		alu.src[0].sel = vtx.dst_gpr;
6537		alu.src[0].chan = 3;
6538
6539		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
6540		alu.src[1].chan = 0;
6541		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6542
6543		alu.last = 1;
6544		r = r600_bytecode_add_alu(ctx->bc, &alu);
6545		if (r)
6546			return r;
6547	}
6548	return 0;
6549}
6550
6551static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
6552{
6553	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6554	struct r600_bytecode_alu alu;
6555	int r;
6556	int id = tgsi_tex_get_src_gpr(ctx, 1);
6557
6558	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6559	alu.op = ALU_OP1_MOV;
6560	alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6561	if (ctx->bc->chip_class >= EVERGREEN) {
6562		/* channel 0 or 2 of each word */
6563		alu.src[0].sel += (id / 2);
6564		alu.src[0].chan = (id % 2) * 2;
6565	} else {
6566		/* r600 we have them at channel 2 of the second dword */
6567		alu.src[0].sel += (id * 2) + 1;
6568		alu.src[0].chan = 1;
6569	}
6570	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6571	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6572	alu.last = 1;
6573	r = r600_bytecode_add_alu(ctx->bc, &alu);
6574	if (r)
6575		return r;
6576	return 0;
6577}
6578
6579static int tgsi_tex(struct r600_shader_ctx *ctx)
6580{
6581	static float one_point_five = 1.5f;
6582	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6583	struct r600_bytecode_tex tex;
6584	struct r600_bytecode_alu alu;
6585	unsigned src_gpr;
6586	int r, i, j;
6587	int opcode;
6588	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
6589				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6590				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
6591				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
6592
6593	bool txf_add_offsets = inst->Texture.NumOffsets &&
6594			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6595			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
6596
6597	/* Texture fetch instructions can only use gprs as source.
6598	 * Also they cannot negate the source or take the absolute value */
6599	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
6600					      inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
6601                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
6602					     read_compressed_msaa || txf_add_offsets;
6603
6604	boolean src_loaded = FALSE;
6605	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
6606	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
6607	boolean has_txq_cube_array_z = false;
6608	unsigned sampler_index_mode;
6609
6610	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
6611	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6612	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
6613		if (inst->Dst[0].Register.WriteMask & 4) {
6614			ctx->shader->has_txq_cube_array_z_comp = true;
6615			has_txq_cube_array_z = true;
6616		}
6617
6618	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
6619	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6620	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
6621	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
6622		sampler_src_reg = 2;
6623
6624	/* TGSI moves the sampler to src reg 3 for TXD */
6625	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
6626		sampler_src_reg = 3;
6627
6628	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6629
6630	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6631
6632	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
6633		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
6634			ctx->shader->uses_tex_buffers = true;
6635			return r600_do_buffer_txq(ctx);
6636		}
6637		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
6638			if (ctx->bc->chip_class < EVERGREEN)
6639				ctx->shader->uses_tex_buffers = true;
6640			return do_vtx_fetch_inst(ctx, src_requires_loading);
6641		}
6642	}
6643
6644	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
6645		int out_chan;
6646		/* Add perspective divide */
6647		if (ctx->bc->chip_class == CAYMAN) {
6648			out_chan = 2;
6649			for (i = 0; i < 3; i++) {
6650				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6651				alu.op = ALU_OP1_RECIP_IEEE;
6652				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6653
6654				alu.dst.sel = ctx->temp_reg;
6655				alu.dst.chan = i;
6656				if (i == 2)
6657					alu.last = 1;
6658				if (out_chan == i)
6659					alu.dst.write = 1;
6660				r = r600_bytecode_add_alu(ctx->bc, &alu);
6661				if (r)
6662					return r;
6663			}
6664
6665		} else {
6666			out_chan = 3;
6667			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6668			alu.op = ALU_OP1_RECIP_IEEE;
6669			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6670
6671			alu.dst.sel = ctx->temp_reg;
6672			alu.dst.chan = out_chan;
6673			alu.last = 1;
6674			alu.dst.write = 1;
6675			r = r600_bytecode_add_alu(ctx->bc, &alu);
6676			if (r)
6677				return r;
6678		}
6679
6680		for (i = 0; i < 3; i++) {
6681			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6682			alu.op = ALU_OP2_MUL;
6683			alu.src[0].sel = ctx->temp_reg;
6684			alu.src[0].chan = out_chan;
6685			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6686			alu.dst.sel = ctx->temp_reg;
6687			alu.dst.chan = i;
6688			alu.dst.write = 1;
6689			r = r600_bytecode_add_alu(ctx->bc, &alu);
6690			if (r)
6691				return r;
6692		}
6693		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6694		alu.op = ALU_OP1_MOV;
6695		alu.src[0].sel = V_SQ_ALU_SRC_1;
6696		alu.src[0].chan = 0;
6697		alu.dst.sel = ctx->temp_reg;
6698		alu.dst.chan = 3;
6699		alu.last = 1;
6700		alu.dst.write = 1;
6701		r = r600_bytecode_add_alu(ctx->bc, &alu);
6702		if (r)
6703			return r;
6704		src_loaded = TRUE;
6705		src_gpr = ctx->temp_reg;
6706	}
6707
6708
6709	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6710	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6711	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6712	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6713	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
6714	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
6715
6716		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
6717		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
6718
6719		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
6720		for (i = 0; i < 4; i++) {
6721			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6722			alu.op = ALU_OP2_CUBE;
6723			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
6724			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
6725			alu.dst.sel = ctx->temp_reg;
6726			alu.dst.chan = i;
6727			if (i == 3)
6728				alu.last = 1;
6729			alu.dst.write = 1;
6730			r = r600_bytecode_add_alu(ctx->bc, &alu);
6731			if (r)
6732				return r;
6733		}
6734
6735		/* tmp1.z = RCP_e(|tmp1.z|) */
6736		if (ctx->bc->chip_class == CAYMAN) {
6737			for (i = 0; i < 3; i++) {
6738				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6739				alu.op = ALU_OP1_RECIP_IEEE;
6740				alu.src[0].sel = ctx->temp_reg;
6741				alu.src[0].chan = 2;
6742				alu.src[0].abs = 1;
6743				alu.dst.sel = ctx->temp_reg;
6744				alu.dst.chan = i;
6745				if (i == 2)
6746					alu.dst.write = 1;
6747				if (i == 2)
6748					alu.last = 1;
6749				r = r600_bytecode_add_alu(ctx->bc, &alu);
6750				if (r)
6751					return r;
6752			}
6753		} else {
6754			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6755			alu.op = ALU_OP1_RECIP_IEEE;
6756			alu.src[0].sel = ctx->temp_reg;
6757			alu.src[0].chan = 2;
6758			alu.src[0].abs = 1;
6759			alu.dst.sel = ctx->temp_reg;
6760			alu.dst.chan = 2;
6761			alu.dst.write = 1;
6762			alu.last = 1;
6763			r = r600_bytecode_add_alu(ctx->bc, &alu);
6764			if (r)
6765				return r;
6766		}
6767
6768		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
6769		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
6770		 * muladd has no writemask, have to use another temp
6771		 */
6772		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6773		alu.op = ALU_OP3_MULADD;
6774		alu.is_op3 = 1;
6775
6776		alu.src[0].sel = ctx->temp_reg;
6777		alu.src[0].chan = 0;
6778		alu.src[1].sel = ctx->temp_reg;
6779		alu.src[1].chan = 2;
6780
6781		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6782		alu.src[2].chan = 0;
6783		alu.src[2].value = *(uint32_t *)&one_point_five;
6784
6785		alu.dst.sel = ctx->temp_reg;
6786		alu.dst.chan = 0;
6787		alu.dst.write = 1;
6788
6789		r = r600_bytecode_add_alu(ctx->bc, &alu);
6790		if (r)
6791			return r;
6792
6793		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6794		alu.op = ALU_OP3_MULADD;
6795		alu.is_op3 = 1;
6796
6797		alu.src[0].sel = ctx->temp_reg;
6798		alu.src[0].chan = 1;
6799		alu.src[1].sel = ctx->temp_reg;
6800		alu.src[1].chan = 2;
6801
6802		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6803		alu.src[2].chan = 0;
6804		alu.src[2].value = *(uint32_t *)&one_point_five;
6805
6806		alu.dst.sel = ctx->temp_reg;
6807		alu.dst.chan = 1;
6808		alu.dst.write = 1;
6809
6810		alu.last = 1;
6811		r = r600_bytecode_add_alu(ctx->bc, &alu);
6812		if (r)
6813			return r;
6814		/* write initial compare value into Z component
6815		  - W src 0 for shadow cube
6816		  - X src 1 for shadow cube array */
6817		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6818		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6819			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6820			alu.op = ALU_OP1_MOV;
6821			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
6822				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6823			else
6824				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6825			alu.dst.sel = ctx->temp_reg;
6826			alu.dst.chan = 2;
6827			alu.dst.write = 1;
6828			alu.last = 1;
6829			r = r600_bytecode_add_alu(ctx->bc, &alu);
6830			if (r)
6831				return r;
6832		}
6833
6834		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6835		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6836			if (ctx->bc->chip_class >= EVERGREEN) {
6837				int mytmp = r600_get_temp(ctx);
6838				static const float eight = 8.0f;
6839				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6840				alu.op = ALU_OP1_MOV;
6841				alu.src[0].sel = ctx->temp_reg;
6842				alu.src[0].chan = 3;
6843				alu.dst.sel = mytmp;
6844				alu.dst.chan = 0;
6845				alu.dst.write = 1;
6846				alu.last = 1;
6847				r = r600_bytecode_add_alu(ctx->bc, &alu);
6848				if (r)
6849					return r;
6850
6851				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
6852				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6853				alu.op = ALU_OP3_MULADD;
6854				alu.is_op3 = 1;
6855				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6856				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6857				alu.src[1].chan = 0;
6858				alu.src[1].value = *(uint32_t *)&eight;
6859				alu.src[2].sel = mytmp;
6860				alu.src[2].chan = 0;
6861				alu.dst.sel = ctx->temp_reg;
6862				alu.dst.chan = 3;
6863				alu.dst.write = 1;
6864				alu.last = 1;
6865				r = r600_bytecode_add_alu(ctx->bc, &alu);
6866				if (r)
6867					return r;
6868			} else if (ctx->bc->chip_class < EVERGREEN) {
6869				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6870				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
6871				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6872				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6873				tex.src_gpr = r600_get_temp(ctx);
6874				tex.src_sel_x = 0;
6875				tex.src_sel_y = 0;
6876				tex.src_sel_z = 0;
6877				tex.src_sel_w = 0;
6878				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6879				tex.coord_type_x = 1;
6880				tex.coord_type_y = 1;
6881				tex.coord_type_z = 1;
6882				tex.coord_type_w = 1;
6883				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6884				alu.op = ALU_OP1_MOV;
6885				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6886				alu.dst.sel = tex.src_gpr;
6887				alu.dst.chan = 0;
6888				alu.last = 1;
6889				alu.dst.write = 1;
6890				r = r600_bytecode_add_alu(ctx->bc, &alu);
6891				if (r)
6892					return r;
6893
6894				r = r600_bytecode_add_tex(ctx->bc, &tex);
6895				if (r)
6896					return r;
6897			}
6898
6899		}
6900
6901		/* for cube forms of lod and bias we need to route things */
6902		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
6903		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
6904		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6905		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
6906			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6907			alu.op = ALU_OP1_MOV;
6908			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6909			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
6910				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6911			else
6912				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6913			alu.dst.sel = ctx->temp_reg;
6914			alu.dst.chan = 2;
6915			alu.last = 1;
6916			alu.dst.write = 1;
6917			r = r600_bytecode_add_alu(ctx->bc, &alu);
6918			if (r)
6919				return r;
6920		}
6921
6922		src_loaded = TRUE;
6923		src_gpr = ctx->temp_reg;
6924	}
6925
6926	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
6927		int temp_h = 0, temp_v = 0;
6928		int start_val = 0;
6929
6930		/* if we've already loaded the src (i.e. CUBE don't reload it). */
6931		if (src_loaded == TRUE)
6932			start_val = 1;
6933		else
6934			src_loaded = TRUE;
6935		for (i = start_val; i < 3; i++) {
6936			int treg = r600_get_temp(ctx);
6937
6938			if (i == 0)
6939				src_gpr = treg;
6940			else if (i == 1)
6941				temp_h = treg;
6942			else
6943				temp_v = treg;
6944
6945			for (j = 0; j < 4; j++) {
6946				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6947				alu.op = ALU_OP1_MOV;
6948                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
6949                                alu.dst.sel = treg;
6950                                alu.dst.chan = j;
6951                                if (j == 3)
6952                                   alu.last = 1;
6953                                alu.dst.write = 1;
6954                                r = r600_bytecode_add_alu(ctx->bc, &alu);
6955                                if (r)
6956                                    return r;
6957			}
6958		}
6959		for (i = 1; i < 3; i++) {
6960			/* set gradients h/v */
6961			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6962			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
6963				FETCH_OP_SET_GRADIENTS_V;
6964			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6965			tex.sampler_index_mode = sampler_index_mode;
6966			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6967			tex.resource_index_mode = sampler_index_mode;
6968
6969			tex.src_gpr = (i == 1) ? temp_h : temp_v;
6970			tex.src_sel_x = 0;
6971			tex.src_sel_y = 1;
6972			tex.src_sel_z = 2;
6973			tex.src_sel_w = 3;
6974
6975			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
6976			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6977			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
6978				tex.coord_type_x = 1;
6979				tex.coord_type_y = 1;
6980				tex.coord_type_z = 1;
6981				tex.coord_type_w = 1;
6982			}
6983			r = r600_bytecode_add_tex(ctx->bc, &tex);
6984			if (r)
6985				return r;
6986		}
6987	}
6988
6989	if (src_requires_loading && !src_loaded) {
6990		for (i = 0; i < 4; i++) {
6991			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6992			alu.op = ALU_OP1_MOV;
6993			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6994			alu.dst.sel = ctx->temp_reg;
6995			alu.dst.chan = i;
6996			if (i == 3)
6997				alu.last = 1;
6998			alu.dst.write = 1;
6999			r = r600_bytecode_add_alu(ctx->bc, &alu);
7000			if (r)
7001				return r;
7002		}
7003		src_loaded = TRUE;
7004		src_gpr = ctx->temp_reg;
7005	}
7006
7007	/* get offset values */
7008	if (inst->Texture.NumOffsets) {
7009		assert(inst->Texture.NumOffsets == 1);
7010
7011		/* The texture offset feature doesn't work with the TXF instruction
7012		 * and must be emulated by adding the offset to the texture coordinates. */
7013		if (txf_add_offsets) {
7014			const struct tgsi_texture_offset *off = inst->TexOffsets;
7015
7016			switch (inst->Texture.Texture) {
7017			case TGSI_TEXTURE_3D:
7018				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7019				alu.op = ALU_OP2_ADD_INT;
7020				alu.src[0].sel = src_gpr;
7021				alu.src[0].chan = 2;
7022				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7023				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7024				alu.dst.sel = src_gpr;
7025				alu.dst.chan = 2;
7026				alu.dst.write = 1;
7027				alu.last = 1;
7028				r = r600_bytecode_add_alu(ctx->bc, &alu);
7029				if (r)
7030					return r;
7031				/* fall through */
7032
7033			case TGSI_TEXTURE_2D:
7034			case TGSI_TEXTURE_SHADOW2D:
7035			case TGSI_TEXTURE_RECT:
7036			case TGSI_TEXTURE_SHADOWRECT:
7037			case TGSI_TEXTURE_2D_ARRAY:
7038			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7039				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7040				alu.op = ALU_OP2_ADD_INT;
7041				alu.src[0].sel = src_gpr;
7042				alu.src[0].chan = 1;
7043				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7044				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7045				alu.dst.sel = src_gpr;
7046				alu.dst.chan = 1;
7047				alu.dst.write = 1;
7048				alu.last = 1;
7049				r = r600_bytecode_add_alu(ctx->bc, &alu);
7050				if (r)
7051					return r;
7052				/* fall through */
7053
7054			case TGSI_TEXTURE_1D:
7055			case TGSI_TEXTURE_SHADOW1D:
7056			case TGSI_TEXTURE_1D_ARRAY:
7057			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7058				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7059				alu.op = ALU_OP2_ADD_INT;
7060				alu.src[0].sel = src_gpr;
7061				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7062				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7063				alu.dst.sel = src_gpr;
7064				alu.dst.write = 1;
7065				alu.last = 1;
7066				r = r600_bytecode_add_alu(ctx->bc, &alu);
7067				if (r)
7068					return r;
7069				break;
7070				/* texture offsets do not apply to other texture targets */
7071			}
7072		} else {
7073			switch (inst->Texture.Texture) {
7074			case TGSI_TEXTURE_3D:
7075				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7076				/* fallthrough */
7077			case TGSI_TEXTURE_2D:
7078			case TGSI_TEXTURE_SHADOW2D:
7079			case TGSI_TEXTURE_RECT:
7080			case TGSI_TEXTURE_SHADOWRECT:
7081			case TGSI_TEXTURE_2D_ARRAY:
7082			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7083				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7084				/* fallthrough */
7085			case TGSI_TEXTURE_1D:
7086			case TGSI_TEXTURE_SHADOW1D:
7087			case TGSI_TEXTURE_1D_ARRAY:
7088			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7089				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7090			}
7091		}
7092	}
7093
7094	/* Obtain the sample index for reading a compressed MSAA color texture.
7095	 * To read the FMASK, we use the ldfptr instruction, which tells us
7096	 * where the samples are stored.
7097	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7098	 * which is the identity mapping. Each nibble says which physical sample
7099	 * should be fetched to get that sample.
7100	 *
7101	 * Assume src.z contains the sample index. It should be modified like this:
7102	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7103	 * Then fetch the texel with src.
7104	 */
7105	if (read_compressed_msaa) {
7106		unsigned sample_chan = 3;
7107		unsigned temp = r600_get_temp(ctx);
7108		assert(src_loaded);
7109
7110		/* temp.w = ldfptr() */
7111		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7112		tex.op = FETCH_OP_LD;
7113		tex.inst_mod = 1; /* to indicate this is ldfptr */
7114		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7115		tex.sampler_index_mode = sampler_index_mode;
7116		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7117		tex.resource_index_mode = sampler_index_mode;
7118		tex.src_gpr = src_gpr;
7119		tex.dst_gpr = temp;
7120		tex.dst_sel_x = 7; /* mask out these components */
7121		tex.dst_sel_y = 7;
7122		tex.dst_sel_z = 7;
7123		tex.dst_sel_w = 0; /* store X */
7124		tex.src_sel_x = 0;
7125		tex.src_sel_y = 1;
7126		tex.src_sel_z = 2;
7127		tex.src_sel_w = 3;
7128		tex.offset_x = offset_x;
7129		tex.offset_y = offset_y;
7130		tex.offset_z = offset_z;
7131		r = r600_bytecode_add_tex(ctx->bc, &tex);
7132		if (r)
7133			return r;
7134
7135		/* temp.x = sample_index*4 */
7136		if (ctx->bc->chip_class == CAYMAN) {
7137			for (i = 0 ; i < 4; i++) {
7138				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7139				alu.op = ALU_OP2_MULLO_INT;
7140				alu.src[0].sel = src_gpr;
7141				alu.src[0].chan = sample_chan;
7142				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7143				alu.src[1].value = 4;
7144				alu.dst.sel = temp;
7145				alu.dst.chan = i;
7146				alu.dst.write = i == 0;
7147				if (i == 3)
7148					alu.last = 1;
7149				r = r600_bytecode_add_alu(ctx->bc, &alu);
7150				if (r)
7151					return r;
7152			}
7153		} else {
7154			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7155			alu.op = ALU_OP2_MULLO_INT;
7156			alu.src[0].sel = src_gpr;
7157			alu.src[0].chan = sample_chan;
7158			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7159			alu.src[1].value = 4;
7160			alu.dst.sel = temp;
7161			alu.dst.chan = 0;
7162			alu.dst.write = 1;
7163			alu.last = 1;
7164			r = r600_bytecode_add_alu(ctx->bc, &alu);
7165			if (r)
7166				return r;
7167		}
7168
7169		/* sample_index = temp.w >> temp.x */
7170		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7171		alu.op = ALU_OP2_LSHR_INT;
7172		alu.src[0].sel = temp;
7173		alu.src[0].chan = 3;
7174		alu.src[1].sel = temp;
7175		alu.src[1].chan = 0;
7176		alu.dst.sel = src_gpr;
7177		alu.dst.chan = sample_chan;
7178		alu.dst.write = 1;
7179		alu.last = 1;
7180		r = r600_bytecode_add_alu(ctx->bc, &alu);
7181		if (r)
7182			return r;
7183
7184		/* sample_index & 0xF */
7185		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7186		alu.op = ALU_OP2_AND_INT;
7187		alu.src[0].sel = src_gpr;
7188		alu.src[0].chan = sample_chan;
7189		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7190		alu.src[1].value = 0xF;
7191		alu.dst.sel = src_gpr;
7192		alu.dst.chan = sample_chan;
7193		alu.dst.write = 1;
7194		alu.last = 1;
7195		r = r600_bytecode_add_alu(ctx->bc, &alu);
7196		if (r)
7197			return r;
7198#if 0
7199		/* visualize the FMASK */
7200		for (i = 0; i < 4; i++) {
7201			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7202			alu.op = ALU_OP1_INT_TO_FLT;
7203			alu.src[0].sel = src_gpr;
7204			alu.src[0].chan = sample_chan;
7205			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7206			alu.dst.chan = i;
7207			alu.dst.write = 1;
7208			alu.last = 1;
7209			r = r600_bytecode_add_alu(ctx->bc, &alu);
7210			if (r)
7211				return r;
7212		}
7213		return 0;
7214#endif
7215	}
7216
7217	/* does this shader want a num layers from TXQ for a cube array? */
7218	if (has_txq_cube_array_z) {
7219		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7220
7221		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7222		alu.op = ALU_OP1_MOV;
7223
7224		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7225		if (ctx->bc->chip_class >= EVERGREEN) {
7226			/* channel 1 or 3 of each word */
7227			alu.src[0].sel += (id / 2);
7228			alu.src[0].chan = ((id % 2) * 2) + 1;
7229		} else {
7230			/* r600 we have them at channel 2 of the second dword */
7231			alu.src[0].sel += (id * 2) + 1;
7232			alu.src[0].chan = 2;
7233		}
7234		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7235		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7236		alu.last = 1;
7237		r = r600_bytecode_add_alu(ctx->bc, &alu);
7238		if (r)
7239			return r;
7240		/* disable writemask from texture instruction */
7241		inst->Dst[0].Register.WriteMask &= ~4;
7242	}
7243
7244	opcode = ctx->inst_info->op;
7245	if (opcode == FETCH_OP_GATHER4 &&
7246		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7247		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7248		opcode = FETCH_OP_GATHER4_O;
7249
7250		/* GATHER4_O/GATHER4_C_O use offset values loaded by
7251		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
7252		   encoded in the instruction are ignored. */
7253		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7254		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7255		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7256		tex.sampler_index_mode = sampler_index_mode;
7257		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7258		tex.resource_index_mode = sampler_index_mode;
7259
7260		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7261		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7262		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7263		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7264		tex.src_sel_w = 4;
7265
7266		tex.dst_sel_x = 7;
7267		tex.dst_sel_y = 7;
7268		tex.dst_sel_z = 7;
7269		tex.dst_sel_w = 7;
7270
7271		r = r600_bytecode_add_tex(ctx->bc, &tex);
7272		if (r)
7273			return r;
7274	}
7275
7276	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7277	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7278	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7279	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7280	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7281	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7282	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7283		switch (opcode) {
7284		case FETCH_OP_SAMPLE:
7285			opcode = FETCH_OP_SAMPLE_C;
7286			break;
7287		case FETCH_OP_SAMPLE_L:
7288			opcode = FETCH_OP_SAMPLE_C_L;
7289			break;
7290		case FETCH_OP_SAMPLE_LB:
7291			opcode = FETCH_OP_SAMPLE_C_LB;
7292			break;
7293		case FETCH_OP_SAMPLE_G:
7294			opcode = FETCH_OP_SAMPLE_C_G;
7295			break;
7296		/* Texture gather variants */
7297		case FETCH_OP_GATHER4:
7298			opcode = FETCH_OP_GATHER4_C;
7299			break;
7300		case FETCH_OP_GATHER4_O:
7301			opcode = FETCH_OP_GATHER4_C_O;
7302			break;
7303		}
7304	}
7305
7306	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7307	tex.op = opcode;
7308
7309	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7310	tex.sampler_index_mode = sampler_index_mode;
7311	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7312	tex.resource_index_mode = sampler_index_mode;
7313	tex.src_gpr = src_gpr;
7314	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7315
7316	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7317		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7318		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7319	}
7320
7321	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7322		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7323		tex.inst_mod = texture_component_select;
7324
7325		if (ctx->bc->chip_class == CAYMAN) {
7326		/* GATHER4 result order is different from TGSI TG4 */
7327			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7328			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7329			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7330			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7331		} else {
7332			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7333			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7334			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7335			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7336		}
7337	}
7338	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7339		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7340		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7341		tex.dst_sel_z = 7;
7342		tex.dst_sel_w = 7;
7343	}
7344	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7345		tex.dst_sel_x = 3;
7346		tex.dst_sel_y = 7;
7347		tex.dst_sel_z = 7;
7348		tex.dst_sel_w = 7;
7349	}
7350	else {
7351		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7352		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7353		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7354		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7355	}
7356
7357
7358	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
7359	    inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7360		tex.src_sel_x = 4;
7361		tex.src_sel_y = 4;
7362		tex.src_sel_z = 4;
7363		tex.src_sel_w = 4;
7364	} else if (src_loaded) {
7365		tex.src_sel_x = 0;
7366		tex.src_sel_y = 1;
7367		tex.src_sel_z = 2;
7368		tex.src_sel_w = 3;
7369	} else {
7370		tex.src_sel_x = ctx->src[0].swizzle[0];
7371		tex.src_sel_y = ctx->src[0].swizzle[1];
7372		tex.src_sel_z = ctx->src[0].swizzle[2];
7373		tex.src_sel_w = ctx->src[0].swizzle[3];
7374		tex.src_rel = ctx->src[0].rel;
7375	}
7376
7377	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7378	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7379	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7380	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7381		tex.src_sel_x = 1;
7382		tex.src_sel_y = 0;
7383		tex.src_sel_z = 3;
7384		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7385	}
7386
7387	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7388	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7389		tex.coord_type_x = 1;
7390		tex.coord_type_y = 1;
7391	}
7392	tex.coord_type_z = 1;
7393	tex.coord_type_w = 1;
7394
7395	tex.offset_x = offset_x;
7396	tex.offset_y = offset_y;
7397	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7398		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7399		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7400		tex.offset_z = 0;
7401	}
7402	else {
7403		tex.offset_z = offset_z;
7404	}
7405
7406	/* Put the depth for comparison in W.
7407	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7408	 * Some instructions expect the depth in Z. */
7409	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7410	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7411	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7412	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7413	    opcode != FETCH_OP_SAMPLE_C_L &&
7414	    opcode != FETCH_OP_SAMPLE_C_LB) {
7415		tex.src_sel_w = tex.src_sel_z;
7416	}
7417
7418	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7419	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7420		if (opcode == FETCH_OP_SAMPLE_C_L ||
7421		    opcode == FETCH_OP_SAMPLE_C_LB) {
7422			/* the array index is read from Y */
7423			tex.coord_type_y = 0;
7424		} else {
7425			/* the array index is read from Z */
7426			tex.coord_type_z = 0;
7427			tex.src_sel_z = tex.src_sel_y;
7428		}
7429	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7430		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7431		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7432		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7433		    (ctx->bc->chip_class >= EVERGREEN)))
7434		/* the array index is read from Z */
7435		tex.coord_type_z = 0;
7436
7437	/* mask unused source components */
7438	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7439		switch (inst->Texture.Texture) {
7440		case TGSI_TEXTURE_2D:
7441		case TGSI_TEXTURE_RECT:
7442			tex.src_sel_z = 7;
7443			tex.src_sel_w = 7;
7444			break;
7445		case TGSI_TEXTURE_1D_ARRAY:
7446			tex.src_sel_y = 7;
7447			tex.src_sel_w = 7;
7448			break;
7449		case TGSI_TEXTURE_1D:
7450			tex.src_sel_y = 7;
7451			tex.src_sel_z = 7;
7452			tex.src_sel_w = 7;
7453			break;
7454		}
7455	}
7456
7457	r = r600_bytecode_add_tex(ctx->bc, &tex);
7458	if (r)
7459		return r;
7460
7461	/* add shadow ambient support  - gallium doesn't do it yet */
7462	return 0;
7463}
7464
7465static int tgsi_lrp(struct r600_shader_ctx *ctx)
7466{
7467	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7468	struct r600_bytecode_alu alu;
7469	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7470	unsigned i, temp_regs[2];
7471	int r;
7472
7473	/* optimize if it's just an equal balance */
7474	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
7475		for (i = 0; i < lasti + 1; i++) {
7476			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7477				continue;
7478
7479			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7480			alu.op = ALU_OP2_ADD;
7481			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
7482			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7483			alu.omod = 3;
7484			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7485			alu.dst.chan = i;
7486			if (i == lasti) {
7487				alu.last = 1;
7488			}
7489			r = r600_bytecode_add_alu(ctx->bc, &alu);
7490			if (r)
7491				return r;
7492		}
7493		return 0;
7494	}
7495
7496	/* 1 - src0 */
7497	for (i = 0; i < lasti + 1; i++) {
7498		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7499			continue;
7500
7501		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7502		alu.op = ALU_OP2_ADD;
7503		alu.src[0].sel = V_SQ_ALU_SRC_1;
7504		alu.src[0].chan = 0;
7505		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7506		r600_bytecode_src_toggle_neg(&alu.src[1]);
7507		alu.dst.sel = ctx->temp_reg;
7508		alu.dst.chan = i;
7509		if (i == lasti) {
7510			alu.last = 1;
7511		}
7512		alu.dst.write = 1;
7513		r = r600_bytecode_add_alu(ctx->bc, &alu);
7514		if (r)
7515			return r;
7516	}
7517
7518	/* (1 - src0) * src2 */
7519	for (i = 0; i < lasti + 1; i++) {
7520		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7521			continue;
7522
7523		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7524		alu.op = ALU_OP2_MUL;
7525		alu.src[0].sel = ctx->temp_reg;
7526		alu.src[0].chan = i;
7527		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7528		alu.dst.sel = ctx->temp_reg;
7529		alu.dst.chan = i;
7530		if (i == lasti) {
7531			alu.last = 1;
7532		}
7533		alu.dst.write = 1;
7534		r = r600_bytecode_add_alu(ctx->bc, &alu);
7535		if (r)
7536			return r;
7537	}
7538
7539	/* src0 * src1 + (1 - src0) * src2 */
7540        if (ctx->src[0].abs)
7541		temp_regs[0] = r600_get_temp(ctx);
7542	else
7543		temp_regs[0] = 0;
7544	if (ctx->src[1].abs)
7545		temp_regs[1] = r600_get_temp(ctx);
7546	else
7547		temp_regs[1] = 0;
7548
7549	for (i = 0; i < lasti + 1; i++) {
7550		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7551			continue;
7552
7553		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7554		alu.op = ALU_OP3_MULADD;
7555		alu.is_op3 = 1;
7556		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
7557		if (r)
7558			return r;
7559		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
7560		if (r)
7561			return r;
7562		alu.src[2].sel = ctx->temp_reg;
7563		alu.src[2].chan = i;
7564
7565		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7566		alu.dst.chan = i;
7567		if (i == lasti) {
7568			alu.last = 1;
7569		}
7570		r = r600_bytecode_add_alu(ctx->bc, &alu);
7571		if (r)
7572			return r;
7573	}
7574	return 0;
7575}
7576
7577static int tgsi_cmp(struct r600_shader_ctx *ctx)
7578{
7579	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7580	struct r600_bytecode_alu alu;
7581	int i, r, j;
7582	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7583	int temp_regs[3];
7584
7585	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7586		temp_regs[j] = 0;
7587		if (ctx->src[j].abs)
7588			temp_regs[j] = r600_get_temp(ctx);
7589	}
7590
7591	for (i = 0; i < lasti + 1; i++) {
7592		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7593			continue;
7594
7595		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7596		alu.op = ALU_OP3_CNDGE;
7597		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
7598		if (r)
7599			return r;
7600		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
7601		if (r)
7602			return r;
7603		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
7604		if (r)
7605			return r;
7606		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7607		alu.dst.chan = i;
7608		alu.dst.write = 1;
7609		alu.is_op3 = 1;
7610		if (i == lasti)
7611			alu.last = 1;
7612		r = r600_bytecode_add_alu(ctx->bc, &alu);
7613		if (r)
7614			return r;
7615	}
7616	return 0;
7617}
7618
7619static int tgsi_ucmp(struct r600_shader_ctx *ctx)
7620{
7621	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7622	struct r600_bytecode_alu alu;
7623	int i, r;
7624	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7625
7626	for (i = 0; i < lasti + 1; i++) {
7627		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7628			continue;
7629
7630		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7631		alu.op = ALU_OP3_CNDE_INT;
7632		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7633		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7634		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
7635		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7636		alu.dst.chan = i;
7637		alu.dst.write = 1;
7638		alu.is_op3 = 1;
7639		if (i == lasti)
7640			alu.last = 1;
7641		r = r600_bytecode_add_alu(ctx->bc, &alu);
7642		if (r)
7643			return r;
7644	}
7645	return 0;
7646}
7647
7648static int tgsi_xpd(struct r600_shader_ctx *ctx)
7649{
7650	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7651	static const unsigned int src0_swizzle[] = {2, 0, 1};
7652	static const unsigned int src1_swizzle[] = {1, 2, 0};
7653	struct r600_bytecode_alu alu;
7654	uint32_t use_temp = 0;
7655	int i, r;
7656
7657	if (inst->Dst[0].Register.WriteMask != 0xf)
7658		use_temp = 1;
7659
7660	for (i = 0; i < 4; i++) {
7661		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7662		alu.op = ALU_OP2_MUL;
7663		if (i < 3) {
7664			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7665			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
7666		} else {
7667			alu.src[0].sel = V_SQ_ALU_SRC_0;
7668			alu.src[0].chan = i;
7669			alu.src[1].sel = V_SQ_ALU_SRC_0;
7670			alu.src[1].chan = i;
7671		}
7672
7673		alu.dst.sel = ctx->temp_reg;
7674		alu.dst.chan = i;
7675		alu.dst.write = 1;
7676
7677		if (i == 3)
7678			alu.last = 1;
7679		r = r600_bytecode_add_alu(ctx->bc, &alu);
7680		if (r)
7681			return r;
7682	}
7683
7684	for (i = 0; i < 4; i++) {
7685		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7686		alu.op = ALU_OP3_MULADD;
7687
7688		if (i < 3) {
7689			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
7690			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
7691		} else {
7692			alu.src[0].sel = V_SQ_ALU_SRC_0;
7693			alu.src[0].chan = i;
7694			alu.src[1].sel = V_SQ_ALU_SRC_0;
7695			alu.src[1].chan = i;
7696		}
7697
7698		alu.src[2].sel = ctx->temp_reg;
7699		alu.src[2].neg = 1;
7700		alu.src[2].chan = i;
7701
7702		if (use_temp)
7703			alu.dst.sel = ctx->temp_reg;
7704		else
7705			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7706		alu.dst.chan = i;
7707		alu.dst.write = 1;
7708		alu.is_op3 = 1;
7709		if (i == 3)
7710			alu.last = 1;
7711		r = r600_bytecode_add_alu(ctx->bc, &alu);
7712		if (r)
7713			return r;
7714	}
7715	if (use_temp)
7716		return tgsi_helper_copy(ctx, inst);
7717	return 0;
7718}
7719
7720static int tgsi_exp(struct r600_shader_ctx *ctx)
7721{
7722	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7723	struct r600_bytecode_alu alu;
7724	int r;
7725	int i;
7726
7727	/* result.x = 2^floor(src); */
7728	if (inst->Dst[0].Register.WriteMask & 1) {
7729		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7730
7731		alu.op = ALU_OP1_FLOOR;
7732		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7733
7734		alu.dst.sel = ctx->temp_reg;
7735		alu.dst.chan = 0;
7736		alu.dst.write = 1;
7737		alu.last = 1;
7738		r = r600_bytecode_add_alu(ctx->bc, &alu);
7739		if (r)
7740			return r;
7741
7742		if (ctx->bc->chip_class == CAYMAN) {
7743			for (i = 0; i < 3; i++) {
7744				alu.op = ALU_OP1_EXP_IEEE;
7745				alu.src[0].sel = ctx->temp_reg;
7746				alu.src[0].chan = 0;
7747
7748				alu.dst.sel = ctx->temp_reg;
7749				alu.dst.chan = i;
7750				alu.dst.write = i == 0;
7751				alu.last = i == 2;
7752				r = r600_bytecode_add_alu(ctx->bc, &alu);
7753				if (r)
7754					return r;
7755			}
7756		} else {
7757			alu.op = ALU_OP1_EXP_IEEE;
7758			alu.src[0].sel = ctx->temp_reg;
7759			alu.src[0].chan = 0;
7760
7761			alu.dst.sel = ctx->temp_reg;
7762			alu.dst.chan = 0;
7763			alu.dst.write = 1;
7764			alu.last = 1;
7765			r = r600_bytecode_add_alu(ctx->bc, &alu);
7766			if (r)
7767				return r;
7768		}
7769	}
7770
7771	/* result.y = tmp - floor(tmp); */
7772	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7773		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7774
7775		alu.op = ALU_OP1_FRACT;
7776		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7777
7778		alu.dst.sel = ctx->temp_reg;
7779#if 0
7780		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7781		if (r)
7782			return r;
7783#endif
7784		alu.dst.write = 1;
7785		alu.dst.chan = 1;
7786
7787		alu.last = 1;
7788
7789		r = r600_bytecode_add_alu(ctx->bc, &alu);
7790		if (r)
7791			return r;
7792	}
7793
7794	/* result.z = RoughApprox2ToX(tmp);*/
7795	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
7796		if (ctx->bc->chip_class == CAYMAN) {
7797			for (i = 0; i < 3; i++) {
7798				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7799				alu.op = ALU_OP1_EXP_IEEE;
7800				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7801
7802				alu.dst.sel = ctx->temp_reg;
7803				alu.dst.chan = i;
7804				if (i == 2) {
7805					alu.dst.write = 1;
7806					alu.last = 1;
7807				}
7808
7809				r = r600_bytecode_add_alu(ctx->bc, &alu);
7810				if (r)
7811					return r;
7812			}
7813		} else {
7814			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7815			alu.op = ALU_OP1_EXP_IEEE;
7816			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7817
7818			alu.dst.sel = ctx->temp_reg;
7819			alu.dst.write = 1;
7820			alu.dst.chan = 2;
7821
7822			alu.last = 1;
7823
7824			r = r600_bytecode_add_alu(ctx->bc, &alu);
7825			if (r)
7826				return r;
7827		}
7828	}
7829
7830	/* result.w = 1.0;*/
7831	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
7832		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7833
7834		alu.op = ALU_OP1_MOV;
7835		alu.src[0].sel = V_SQ_ALU_SRC_1;
7836		alu.src[0].chan = 0;
7837
7838		alu.dst.sel = ctx->temp_reg;
7839		alu.dst.chan = 3;
7840		alu.dst.write = 1;
7841		alu.last = 1;
7842		r = r600_bytecode_add_alu(ctx->bc, &alu);
7843		if (r)
7844			return r;
7845	}
7846	return tgsi_helper_copy(ctx, inst);
7847}
7848
7849static int tgsi_log(struct r600_shader_ctx *ctx)
7850{
7851	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7852	struct r600_bytecode_alu alu;
7853	int r;
7854	int i;
7855
7856	/* result.x = floor(log2(|src|)); */
7857	if (inst->Dst[0].Register.WriteMask & 1) {
7858		if (ctx->bc->chip_class == CAYMAN) {
7859			for (i = 0; i < 3; i++) {
7860				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7861
7862				alu.op = ALU_OP1_LOG_IEEE;
7863				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7864				r600_bytecode_src_set_abs(&alu.src[0]);
7865
7866				alu.dst.sel = ctx->temp_reg;
7867				alu.dst.chan = i;
7868				if (i == 0)
7869					alu.dst.write = 1;
7870				if (i == 2)
7871					alu.last = 1;
7872				r = r600_bytecode_add_alu(ctx->bc, &alu);
7873				if (r)
7874					return r;
7875			}
7876
7877		} else {
7878			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7879
7880			alu.op = ALU_OP1_LOG_IEEE;
7881			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7882			r600_bytecode_src_set_abs(&alu.src[0]);
7883
7884			alu.dst.sel = ctx->temp_reg;
7885			alu.dst.chan = 0;
7886			alu.dst.write = 1;
7887			alu.last = 1;
7888			r = r600_bytecode_add_alu(ctx->bc, &alu);
7889			if (r)
7890				return r;
7891		}
7892
7893		alu.op = ALU_OP1_FLOOR;
7894		alu.src[0].sel = ctx->temp_reg;
7895		alu.src[0].chan = 0;
7896
7897		alu.dst.sel = ctx->temp_reg;
7898		alu.dst.chan = 0;
7899		alu.dst.write = 1;
7900		alu.last = 1;
7901
7902		r = r600_bytecode_add_alu(ctx->bc, &alu);
7903		if (r)
7904			return r;
7905	}
7906
7907	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
7908	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7909
7910		if (ctx->bc->chip_class == CAYMAN) {
7911			for (i = 0; i < 3; i++) {
7912				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7913
7914				alu.op = ALU_OP1_LOG_IEEE;
7915				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7916				r600_bytecode_src_set_abs(&alu.src[0]);
7917
7918				alu.dst.sel = ctx->temp_reg;
7919				alu.dst.chan = i;
7920				if (i == 1)
7921					alu.dst.write = 1;
7922				if (i == 2)
7923					alu.last = 1;
7924
7925				r = r600_bytecode_add_alu(ctx->bc, &alu);
7926				if (r)
7927					return r;
7928			}
7929		} else {
7930			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7931
7932			alu.op = ALU_OP1_LOG_IEEE;
7933			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7934			r600_bytecode_src_set_abs(&alu.src[0]);
7935
7936			alu.dst.sel = ctx->temp_reg;
7937			alu.dst.chan = 1;
7938			alu.dst.write = 1;
7939			alu.last = 1;
7940
7941			r = r600_bytecode_add_alu(ctx->bc, &alu);
7942			if (r)
7943				return r;
7944		}
7945
7946		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7947
7948		alu.op = ALU_OP1_FLOOR;
7949		alu.src[0].sel = ctx->temp_reg;
7950		alu.src[0].chan = 1;
7951
7952		alu.dst.sel = ctx->temp_reg;
7953		alu.dst.chan = 1;
7954		alu.dst.write = 1;
7955		alu.last = 1;
7956
7957		r = r600_bytecode_add_alu(ctx->bc, &alu);
7958		if (r)
7959			return r;
7960
7961		if (ctx->bc->chip_class == CAYMAN) {
7962			for (i = 0; i < 3; i++) {
7963				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7964				alu.op = ALU_OP1_EXP_IEEE;
7965				alu.src[0].sel = ctx->temp_reg;
7966				alu.src[0].chan = 1;
7967
7968				alu.dst.sel = ctx->temp_reg;
7969				alu.dst.chan = i;
7970				if (i == 1)
7971					alu.dst.write = 1;
7972				if (i == 2)
7973					alu.last = 1;
7974
7975				r = r600_bytecode_add_alu(ctx->bc, &alu);
7976				if (r)
7977					return r;
7978			}
7979		} else {
7980			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7981			alu.op = ALU_OP1_EXP_IEEE;
7982			alu.src[0].sel = ctx->temp_reg;
7983			alu.src[0].chan = 1;
7984
7985			alu.dst.sel = ctx->temp_reg;
7986			alu.dst.chan = 1;
7987			alu.dst.write = 1;
7988			alu.last = 1;
7989
7990			r = r600_bytecode_add_alu(ctx->bc, &alu);
7991			if (r)
7992				return r;
7993		}
7994
7995		if (ctx->bc->chip_class == CAYMAN) {
7996			for (i = 0; i < 3; i++) {
7997				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7998				alu.op = ALU_OP1_RECIP_IEEE;
7999				alu.src[0].sel = ctx->temp_reg;
8000				alu.src[0].chan = 1;
8001
8002				alu.dst.sel = ctx->temp_reg;
8003				alu.dst.chan = i;
8004				if (i == 1)
8005					alu.dst.write = 1;
8006				if (i == 2)
8007					alu.last = 1;
8008
8009				r = r600_bytecode_add_alu(ctx->bc, &alu);
8010				if (r)
8011					return r;
8012			}
8013		} else {
8014			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8015			alu.op = ALU_OP1_RECIP_IEEE;
8016			alu.src[0].sel = ctx->temp_reg;
8017			alu.src[0].chan = 1;
8018
8019			alu.dst.sel = ctx->temp_reg;
8020			alu.dst.chan = 1;
8021			alu.dst.write = 1;
8022			alu.last = 1;
8023
8024			r = r600_bytecode_add_alu(ctx->bc, &alu);
8025			if (r)
8026				return r;
8027		}
8028
8029		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8030
8031		alu.op = ALU_OP2_MUL;
8032
8033		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8034		r600_bytecode_src_set_abs(&alu.src[0]);
8035
8036		alu.src[1].sel = ctx->temp_reg;
8037		alu.src[1].chan = 1;
8038
8039		alu.dst.sel = ctx->temp_reg;
8040		alu.dst.chan = 1;
8041		alu.dst.write = 1;
8042		alu.last = 1;
8043
8044		r = r600_bytecode_add_alu(ctx->bc, &alu);
8045		if (r)
8046			return r;
8047	}
8048
8049	/* result.z = log2(|src|);*/
8050	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
8051		if (ctx->bc->chip_class == CAYMAN) {
8052			for (i = 0; i < 3; i++) {
8053				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8054
8055				alu.op = ALU_OP1_LOG_IEEE;
8056				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8057				r600_bytecode_src_set_abs(&alu.src[0]);
8058
8059				alu.dst.sel = ctx->temp_reg;
8060				if (i == 2)
8061					alu.dst.write = 1;
8062				alu.dst.chan = i;
8063				if (i == 2)
8064					alu.last = 1;
8065
8066				r = r600_bytecode_add_alu(ctx->bc, &alu);
8067				if (r)
8068					return r;
8069			}
8070		} else {
8071			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8072
8073			alu.op = ALU_OP1_LOG_IEEE;
8074			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8075			r600_bytecode_src_set_abs(&alu.src[0]);
8076
8077			alu.dst.sel = ctx->temp_reg;
8078			alu.dst.write = 1;
8079			alu.dst.chan = 2;
8080			alu.last = 1;
8081
8082			r = r600_bytecode_add_alu(ctx->bc, &alu);
8083			if (r)
8084				return r;
8085		}
8086	}
8087
8088	/* result.w = 1.0; */
8089	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
8090		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8091
8092		alu.op = ALU_OP1_MOV;
8093		alu.src[0].sel = V_SQ_ALU_SRC_1;
8094		alu.src[0].chan = 0;
8095
8096		alu.dst.sel = ctx->temp_reg;
8097		alu.dst.chan = 3;
8098		alu.dst.write = 1;
8099		alu.last = 1;
8100
8101		r = r600_bytecode_add_alu(ctx->bc, &alu);
8102		if (r)
8103			return r;
8104	}
8105
8106	return tgsi_helper_copy(ctx, inst);
8107}
8108
8109static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
8110{
8111	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8112	struct r600_bytecode_alu alu;
8113	int r;
8114	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8115	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
8116
8117	assert(inst->Dst[0].Register.Index < 3);
8118	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8119
8120	switch (inst->Instruction.Opcode) {
8121	case TGSI_OPCODE_ARL:
8122		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
8123		break;
8124	case TGSI_OPCODE_ARR:
8125		alu.op = ALU_OP1_FLT_TO_INT;
8126		break;
8127	case TGSI_OPCODE_UARL:
8128		alu.op = ALU_OP1_MOV;
8129		break;
8130	default:
8131		assert(0);
8132		return -1;
8133	}
8134
8135	for (i = 0; i <= lasti; ++i) {
8136		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8137			continue;
8138		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8139		alu.last = i == lasti;
8140		alu.dst.sel = reg;
8141	        alu.dst.chan = i;
8142		alu.dst.write = 1;
8143		r = r600_bytecode_add_alu(ctx->bc, &alu);
8144		if (r)
8145			return r;
8146	}
8147
8148	if (inst->Dst[0].Register.Index > 0)
8149		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
8150	else
8151		ctx->bc->ar_loaded = 0;
8152
8153	return 0;
8154}
/* Emit ARL/ARR/UARL on r600/r700: stage the converted value in the ar_reg
 * GPR, then mark ar_loaded = 0 so the next indirect access moves it into the
 * hardware AR register.  FLT_TO_INT runs only in the trans unit on these
 * chips, so each such ALU must close its group (alu.last set per add). */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* ARL = floor then convert: first FLOOR the written channels
		 * into ar_reg... */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* ...then convert ar_reg in place.  NOTE(review): this loop
		 * converts every channel up to lasti regardless of the
		 * writemask — unwritten channels convert whatever was in
		 * ar_reg; presumably harmless since only written channels
		 * are consumed. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ARR = round-to-nearest conversion straight into ar_reg */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer: plain MOV into ar_reg */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force the AR register to be reloaded from ar_reg on next use */
	ctx->bc->ar_loaded = 0;
	return 0;
}
8231
8232static int tgsi_opdst(struct r600_shader_ctx *ctx)
8233{
8234	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8235	struct r600_bytecode_alu alu;
8236	int i, r = 0;
8237
8238	for (i = 0; i < 4; i++) {
8239		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8240
8241		alu.op = ALU_OP2_MUL;
8242		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8243
8244		if (i == 0 || i == 3) {
8245			alu.src[0].sel = V_SQ_ALU_SRC_1;
8246		} else {
8247			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8248		}
8249
8250		if (i == 0 || i == 2) {
8251			alu.src[1].sel = V_SQ_ALU_SRC_1;
8252		} else {
8253			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8254		}
8255		if (i == 3)
8256			alu.last = 1;
8257		r = r600_bytecode_add_alu(ctx->bc, &alu);
8258		if (r)
8259			return r;
8260	}
8261	return 0;
8262}
8263
8264static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
8265{
8266	struct r600_bytecode_alu alu;
8267	int r;
8268
8269	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8270	alu.op = opcode;
8271	alu.execute_mask = 1;
8272	alu.update_pred = 1;
8273
8274	alu.dst.sel = ctx->temp_reg;
8275	alu.dst.write = 1;
8276	alu.dst.chan = 0;
8277
8278	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8279	alu.src[1].sel = V_SQ_ALU_SRC_0;
8280	alu.src[1].chan = 0;
8281
8282	alu.last = 1;
8283
8284	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
8285	if (r)
8286		return r;
8287	return 0;
8288}
8289
8290static int pops(struct r600_shader_ctx *ctx, int pops)
8291{
8292	unsigned force_pop = ctx->bc->force_add_cf;
8293
8294	if (!force_pop) {
8295		int alu_pop = 3;
8296		if (ctx->bc->cf_last) {
8297			if (ctx->bc->cf_last->op == CF_OP_ALU)
8298				alu_pop = 0;
8299			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
8300				alu_pop = 1;
8301		}
8302		alu_pop += pops;
8303		if (alu_pop == 1) {
8304			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
8305			ctx->bc->force_add_cf = 1;
8306		} else if (alu_pop == 2) {
8307			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
8308			ctx->bc->force_add_cf = 1;
8309		} else {
8310			force_pop = 1;
8311		}
8312	}
8313
8314	if (force_pop) {
8315		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
8316		ctx->bc->cf_last->pop_count = pops;
8317		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
8318	}
8319
8320	return 0;
8321}
8322
/* Recompute the worst-case HW call-stack depth (in hw stack "entries")
 * implied by the current nesting counters, and record it in
 * stack->max_entries.  Called after every callstack_push(); 'reason'
 * identifies the kind of level just pushed (FC_PUSH_VPM / FC_PUSH_WQM /
 * FC_LOOP) and selects the chip-specific extra reservation below.
 * NOTE: the CAYMAN case intentionally falls through into EVERGREEN. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	/* hw-reported size of one stack entry, in stack "elements" */
	unsigned entry_size = stack->entry_size;

	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
8386
8387static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
8388{
8389	switch(reason) {
8390	case FC_PUSH_VPM:
8391		--ctx->bc->stack.push;
8392		assert(ctx->bc->stack.push >= 0);
8393		break;
8394	case FC_PUSH_WQM:
8395		--ctx->bc->stack.push_wqm;
8396		assert(ctx->bc->stack.push_wqm >= 0);
8397		break;
8398	case FC_LOOP:
8399		--ctx->bc->stack.loop;
8400		assert(ctx->bc->stack.loop >= 0);
8401		break;
8402	default:
8403		assert(0);
8404		break;
8405	}
8406}
8407
8408static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
8409{
8410	switch (reason) {
8411	case FC_PUSH_VPM:
8412		++ctx->bc->stack.push;
8413		break;
8414	case FC_PUSH_WQM:
8415		++ctx->bc->stack.push_wqm;
8416	case FC_LOOP:
8417		++ctx->bc->stack.loop;
8418		break;
8419	default:
8420		assert(0);
8421	}
8422
8423	callstack_update_max_depth(ctx, reason);
8424}
8425
8426static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
8427{
8428	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
8429
8430	sp->mid = realloc((void *)sp->mid,
8431						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
8432	sp->mid[sp->num_mid] = ctx->bc->cf_last;
8433	sp->num_mid++;
8434}
8435
8436static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
8437{
8438	ctx->bc->fc_sp++;
8439	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
8440	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
8441}
8442
8443static void fc_poplevel(struct r600_shader_ctx *ctx)
8444{
8445	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
8446	free(sp->mid);
8447	sp->mid = NULL;
8448	sp->num_mid = 0;
8449	sp->start = NULL;
8450	sp->type = 0;
8451	ctx->bc->fc_sp--;
8452}
8453
#if 0
/* Dead code (never compiled): an unfinished sketch of subroutine
 * return / LOOPRET-style control flow.
 * NOTE(review): the r600_bytecode_add_cfinst() calls in emit_return()
 * and emit_jump_to_offset() carry a stray trailing ')', so this block
 * would not compile as-is if re-enabled - fix before use. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* stub - was meant to record the "return requested" flag inside a loop */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* stub - was meant to test the "return requested" flag */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
8501
/* Open an IF block: emit a predicate-setting ALU clause (the opcode
 * selects the compare flavor, see emit_logic_pred) followed by a
 * conditional JUMP, then push the new level on the SW control-flow
 * stack and the HW stack-depth accounting.  tgsi_else()/tgsi_endif()
 * patch the JUMP's target later.  Always returns 0. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
8525
/* TGSI IF: open an IF block taken when the float condition src0.x != 0.0. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
8530
/* TGSI UIF: open an IF block taken when the integer condition src0.x != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
8535
8536static int tgsi_else(struct r600_shader_ctx *ctx)
8537{
8538	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
8539	ctx->bc->cf_last->pop_count = 1;
8540
8541	fc_set_mid(ctx, ctx->bc->fc_sp);
8542	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
8543	return 0;
8544}
8545
8546static int tgsi_endif(struct r600_shader_ctx *ctx)
8547{
8548	pops(ctx, 1);
8549	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
8550		R600_ERR("if/endif unbalanced in shader\n");
8551		return -1;
8552	}
8553
8554	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
8555		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
8556		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
8557	} else {
8558		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
8559	}
8560	fc_poplevel(ctx);
8561
8562	callstack_pop(ctx, FC_PUSH_VPM);
8563	return 0;
8564}
8565
/* TGSI BGNLOOP: emit LOOP_START_DX10 and push a loop nesting level; the
 * loop's branch addresses are fixed up later by tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
8578
8579static int tgsi_endloop(struct r600_shader_ctx *ctx)
8580{
8581	int i;
8582
8583	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
8584
8585	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
8586		R600_ERR("loop/endloop in shader code are not paired.\n");
8587		return -EINVAL;
8588	}
8589
8590	/* fixup loop pointers - from r600isa
8591	   LOOP END points to CF after LOOP START,
8592	   LOOP START point to CF after LOOP END
8593	   BRK/CONT point to LOOP END CF
8594	*/
8595	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
8596
8597	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
8598
8599	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
8600		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
8601	}
8602	/* XXX add LOOPRET support */
8603	fc_poplevel(ctx);
8604	callstack_pop(ctx, FC_LOOP);
8605	return 0;
8606}
8607
/* TGSI BREAKC: conditional loop break (break when src0.x != 0).
 * Emitted either as PRED_SETE + ALU_BREAK, or - on Evergreen parts
 * other than Cypress/Juniper - as an IF around an unconditional
 * LOOP_BREAK, to dodge a hardware active-mask bug. */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* find the innermost enclosing FC_LOOP frame; index 0 means none */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		/* unconditional LOOP_BREAK inside the IF; register it with the
		 * loop so tgsi_endloop() patches its branch target */
		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
8646
8647static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
8648{
8649	unsigned int fscp;
8650
8651	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
8652	{
8653		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
8654			break;
8655	}
8656
8657	if (fscp == 0) {
8658		R600_ERR("Break not inside loop/endloop pair\n");
8659		return -EINVAL;
8660	}
8661
8662	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8663
8664	fc_set_mid(ctx, fscp);
8665
8666	return 0;
8667}
8668
8669static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
8670{
8671	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8672	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
8673	int r;
8674
8675	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
8676		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
8677
8678	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8679	if (!r) {
8680		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
8681		return emit_inc_ring_offset(ctx, stream, TRUE);
8682	}
8683	return r;
8684}
8685
/* TGSI UMAD: dst = src0 * src1 + src2 (unsigned 32-bit low multiply
 * then integer add), computed per write-masked channel in two passes
 * with the intermediate product held in temp_reg. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman: MULLO_UINT is replicated across all four vector
			 * slots (see the CAYMAN notes at the top of the file), so
			 * emit a full slot group and let only slot i write out */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* second pass: temp (the product) + src2 -> dst */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
8757
/* Dispatch table for R600/R700: indexed by TGSI opcode, each entry
 * holds the hardware opcode (ALU_OP* / FETCH_OP* / CF_OP*, or 0 when
 * unused) plus the translation callback invoked for that instruction.
 * Numeric indices ([22], [23], ...) fill gaps in the TGSI opcode space
 * that have no named opcode; tgsi_unsupported reports an error. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8962
8963static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
8964	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
8965	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
8966	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
8967	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
8968	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
8969	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
8970	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
8971	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
8972	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
8973	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
8974	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
8975	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
8976	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
8977	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
8978	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
8979	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
8980	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
8981	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
8982	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
8983	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
8984	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
8985	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
8986	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
8987	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
8988	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
8989	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
8990	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
8991	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
8992	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
8993	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
8994	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
8995	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
8996	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
8997	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
8998	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
8999	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
9000	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
9001	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9002	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9003	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
9004	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9005	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9006	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9007	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9008	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
9009	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
9010	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
9011	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
9012	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
9013	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
9014	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
9015	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
9016	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
9017	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
9018	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
9019	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9020	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9021	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9022	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9023	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
9024	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
9025	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
9026	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
9027	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
9028	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
9029	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
9030	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
9031	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
9032	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9033	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
9034	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
9035	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
9036	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9037	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9038	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
9039	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
9040	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
9041	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
9042	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
9043	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9044	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9045	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
9046	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
9047	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
9048	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
9049	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
9050	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
9051	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
9052	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
9053	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
9054	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
9055	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
9056	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
9057	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9058	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
9059	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9060	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9061	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9062	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9063	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
9064	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9065	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
9066	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9067	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9068	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9069	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
9070	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
9071	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
9072	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
9073	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
9074	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9075	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9076	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
9077	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
9078	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
9079	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
9080	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
9081	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
9082	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
9083	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
9084	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
9085	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
9086	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
9087	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
9088	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
9089	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
9090	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9091	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
9092	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
9093	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
9094	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
9095	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
9096	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
9097	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
9098	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
9099	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
9100	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
9101	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
9102	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
9103	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9104	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
9105	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9106	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
9107	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
9108	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9109	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
9110	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
9111	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
9112	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
9113	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
9114	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
9115	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
9116	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
9117	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
9118	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
9119	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
9120	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
9121	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
9122	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
9123	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
9124	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
9125	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9126	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
9127	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9128	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9129	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9130	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
9131	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
9132	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
9133	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
9134	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
9135	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9136	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9137	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9138	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9139	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9140	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9141	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
9142	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9143	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9144	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
9145	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
9146	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
9147	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
9148	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
9149	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
9150	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
9151	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
9152	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
9153	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
9154	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
9155	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
9156	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9157	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9158	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9159	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
9160	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
9161	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
9162	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
9163	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
9164	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
9165	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
9166	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
9167	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
9168	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
9169	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
9170	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
9171	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
9172	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
9173	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
9174	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
9175	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
9176	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
9177	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
9178	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
9179	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
9180	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
9181	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
9182	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
9183};
9184
/* Cayman (CM) TGSI -> r600 translation table, indexed by TGSI opcode.
 * Each entry pairs a hardware ALU/fetch/CF opcode with the callback that
 * emits code for it; entries whose callback is tgsi_unsupported are opcodes
 * this backend does not handle, and bare numeric indices (e.g. [22], [105])
 * are values with no TGSI_OPCODE_* name visible here.  An op field of
 * ALU_OP0_NOP (or 0) means the callback picks the hardware op(s) itself.
 *
 * Per the CAYMAN notes at the top of this file, ops that are t-slot-only on
 * earlier chips are implemented across all vector slots on Cayman, which is
 * why several entries use cayman_* emitters instead of the shared handlers.
 */
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	/* RCP/RSQ are t-slot-only pre-Cayman; emitted per vector slot here. */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	/* a < b is emitted as b > a, hence the SETGT op with operand swap. */
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	/* No hardware SUB: ADD is used; the handler presumably negates the
	 * second source -- TODO(review): confirm in tgsi_op2. */
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	/* ABS as a MOV; the abs is presumably applied as a source modifier by
	 * the handler -- TODO(review): confirm. */
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
	/* Derivatives and all other sampling go through the texture path. */
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
	/* a <= b emitted as b >= a (swap), matching SLT above. */
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* Control flow maps to CF-level ops rather than ALU ops. */
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	/* _FINE variants share the emitters of the coarse derivatives. */
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	/* Geometry shader stream output. */
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* Float compares with DX10-style (0/0xffffffff) boolean results. */
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	/* Conversions use tgsi_op2 here (the FLT_TO_*/UINT_TO_FLT ops are
	 * t-slot-only on earlier chips but vector ops on CM, per the CAYMAN
	 * notes at the top of this file). */
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},  /* 0 - src */
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	/* Signed MULLO is fine for UMUL: the low 32 bits of a product are
	 * identical regardless of signedness. */
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* SM5-style SAMPLE_* opcodes are not implemented by this backend. */
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* Image/buffer atomics: not implemented here. */
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* 32x32->high-32 multiplies: t-slot-only pre-CM, hence cayman_* here. */
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	/* Bitfield ops. */
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	/* Fragment interpolation (EG/CM shared path). */
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* Double-precision (FP64) section. */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	/* Like ABS above: MOV with the abs presumably applied as a source
	 * modifier by the 64-bit handler -- TODO(review): confirm. */
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
9406