/* r600_shader.c revision 2a9639e41fdcecb489e39f739e4d42e6a78655f3 */
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "sb/sb_public.h"
31
32#include "pipe/p_shader_tokens.h"
33#include "tgsi/tgsi_info.h"
34#include "tgsi/tgsi_parse.h"
35#include "tgsi/tgsi_scan.h"
36#include "tgsi/tgsi_dump.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include <stdio.h>
40#include <errno.h>
41
42/* CAYMAN notes
43Why CAYMAN got loops for lots of instructions is explained here.
44
45-These 8xx t-slot only ops are implemented in all vector slots.
46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47These 8xx t-slot only opcodes become vector ops, with all four
48slots expecting the arguments on sources a and b. Result is
49broadcast to all channels.
50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51These 8xx t-slot only opcodes become vector ops in the z, y, and
52x slots.
53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55SQRT_IEEE/_64
56SIN/COS
57The w slot may have an independent co-issued operation, or if the
58result is required to be in the w slot, the opcode above may be
59issued in the w slot as well.
60The compiler must issue the source argument to slots z, y, and x
61*/
62
63#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
64static int r600_shader_from_tgsi(struct r600_context *rctx,
65				 struct r600_pipe_shader *pipeshader,
66				 union r600_shader_key key);
67
68
69static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
70                           int size, unsigned comp_mask) {
71
72	if (!size)
73		return;
74
75	if (ps->num_arrays == ps->max_arrays) {
76		ps->max_arrays += 64;
77		ps->arrays = realloc(ps->arrays, ps->max_arrays *
78		                     sizeof(struct r600_shader_array));
79	}
80
81	int n = ps->num_arrays;
82	++ps->num_arrays;
83
84	ps->arrays[n].comp_mask = comp_mask;
85	ps->arrays[n].gpr_start = start_gpr;
86	ps->arrays[n].gpr_count = size;
87}
88
89static void r600_dump_streamout(struct pipe_stream_output_info *so)
90{
91	unsigned i;
92
93	fprintf(stderr, "STREAMOUT\n");
94	for (i = 0; i < so->num_outputs; i++) {
95		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
96				so->output[i].start_component;
97		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
98			i,
99			so->output[i].stream,
100			so->output[i].output_buffer,
101			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
102			so->output[i].register_index,
103			mask & 1 ? "x" : "",
104		        mask & 2 ? "y" : "",
105		        mask & 4 ? "z" : "",
106		        mask & 8 ? "w" : "",
107			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
108	}
109}
110
111static int store_shader(struct pipe_context *ctx,
112			struct r600_pipe_shader *shader)
113{
114	struct r600_context *rctx = (struct r600_context *)ctx;
115	uint32_t *ptr, i;
116
117	if (shader->bo == NULL) {
118		shader->bo = (struct r600_resource*)
119			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
120		if (shader->bo == NULL) {
121			return -ENOMEM;
122		}
123		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
124		if (R600_BIG_ENDIAN) {
125			for (i = 0; i < shader->shader.bc.ndw; ++i) {
126				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
127			}
128		} else {
129			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
130		}
131		rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
132	}
133
134	return 0;
135}
136
/* Compile one shader variant: translate the selector's TGSI into r600
 * bytecode, optionally run the SB optimizing backend, upload the result
 * into a buffer object and derive the per-stage hardware state.
 *
 * ctx:    pipe context (really a struct r600_context)
 * shader: variant to fill in; shader->selector supplies the TGSI tokens
 * key:    variant key (as_es/as_ls flags etc.)
 *
 * Returns 0 on success or a negative errno; on any failure the partially
 * built shader is destroyed before returning.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* SB backend is on by default unless debug-disabled */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == TGSI_PROCESSOR_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	/* SB is not used for the tessellation stages either */
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_EVAL);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built.  When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Either plain disassembly, or SB processing (which can also
	 * disassemble) when the optimizer and/or its disasm is enabled. */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	/* A GS always has an associated copy shader that must be uploaded
	 * (and optionally dumped) as well. */
	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state: pick the per-stage state builder; pre-evergreen and
	 * evergreen+ chips have separate implementations. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
270
/* Free everything a compiled shader variant owns: the uploaded bytecode
 * buffer object, the CPU-side bytecode, and its command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	/* drops the reference and NULLs shader->bo */
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
277
278/*
279 * tgsi -> r600 shader
280 */
281struct r600_shader_tgsi_instruction;
282
/* Decoded form of one TGSI source operand, ready to be turned into
 * r600 ALU source fields (see r600_bytecode_src()). */
struct r600_shader_src {
	unsigned				sel;        /* register / special-value selector */
	unsigned				swizzle[4]; /* per-channel swizzle */
	unsigned				neg;        /* negate modifier */
	unsigned				abs;        /* absolute-value modifier */
	unsigned				rel;        /* relative (indirect) addressing */
	unsigned				kc_bank;    /* constant-cache bank for constants */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];   /* literal values when sel is a literal */
};
293
/* One evergreen barycentric interpolator (ij pair) slot. */
struct eg_interp {
	boolean					enabled;  /* referenced by at least one input/instruction */
	unsigned				ij_index; /* allocated ij pair; two pairs share a GPR */
};
298
/* Transient state for one TGSI -> r600 bytecode translation pass. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;	/* TGSI_PROCESSOR_* of the shader being built */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* first GPR of each TGSI register file */
	unsigned				temp_reg; /* base GPR for driver temporaries */
	const struct r600_shader_tgsi_instruction	*inst_info; /* entry for the current opcode */
	struct r600_bytecode			*bc;      /* bytecode being emitted */
	struct r600_shader			*shader;  /* shader metadata being filled in */
	struct r600_shader_src			src[4];   /* decoded sources of the current instruction */
	uint32_t				*literals; /* literal pool of the current instruction */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* temps consumed by the current instruction */
	boolean use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;      /* output slot of CLIPVERTEX */
	unsigned		edgeflag_output; /* output slot of EDGEFLAG */
	int					fragcoord_input; /* input slot of POSITION in FS */
	int					native_integers;
	int					next_ring_offset;   /* GS input ring offset allocator */
	int					gs_out_ring_offset; /* GS output ring offset allocator */
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess output offsets */
};
336
/* Dispatch-table entry mapping one TGSI opcode to an r600 op and the
 * callback that emits bytecode for it. */
struct r600_shader_tgsi_instruction {
	unsigned	op;	/* r600 opcode used by the callback */
	int (*process)(struct r600_shader_ctx *ctx); /* emits the instruction, returns 0 or -errno */
};
341
342static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
343static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
344static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
345static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
346static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
347static int tgsi_else(struct r600_shader_ctx *ctx);
348static int tgsi_endif(struct r600_shader_ctx *ctx);
349static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
350static int tgsi_endloop(struct r600_shader_ctx *ctx);
351static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
352static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
353                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
354                                unsigned int dst_reg);
355static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
356			const struct r600_shader_src *shader_src,
357			unsigned chan);
358
/* Return the index of the highest channel set in a 4-bit writemask,
 * or 0 when the mask is empty.  Used to mark the last ALU instruction
 * of a per-channel group. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
370
/* Reject TGSI instructions this backend cannot handle: multiple
 * destinations (except DFRACEXP), predicates, 2D-indexed destinations,
 * and 2D-indexed sources other than constants and GS inputs.
 * Returns 0 when supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
		   switch (i->Src[j].Register.File) {
		   case TGSI_FILE_CONSTANT:
			   break;
		   case TGSI_FILE_INPUT:
			   if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
				   break;
			   /* fallthrough: 2D inputs are only valid in GS */
		   default:
			   R600_ERR("unsupported src %d (dimension %d)\n", j,
				    i->Src[j].Register.Dimension);
			   return -EINVAL;
		   }
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
413
414int eg_get_interpolator_index(unsigned interpolate, unsigned location)
415{
416	if (interpolate == TGSI_INTERPOLATE_COLOR ||
417		interpolate == TGSI_INTERPOLATE_LINEAR ||
418		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
419	{
420		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
421		int loc;
422
423		switch(location) {
424		case TGSI_INTERPOLATE_LOC_CENTER:
425			loc = 1;
426			break;
427		case TGSI_INTERPOLATE_LOC_CENTROID:
428			loc = 2;
429			break;
430		case TGSI_INTERPOLATE_LOC_SAMPLE:
431		default:
432			loc = 0; break;
433		}
434
435		return is_linear * 3 + loc;
436	}
437
438	return -1;
439}
440
441static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
442		int input)
443{
444	int i = eg_get_interpolator_index(
445		ctx->shader->input[input].interpolate,
446		ctx->shader->input[input].interpolate_location);
447	assert(i >= 0);
448	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
449}
450
/* Emit the ALU sequence that interpolates one FS input on evergreen:
 * a group of four INTERP_ZW slots followed by a group of four
 * INTERP_XY slots, sourcing the input's ij barycentrics and its LDS
 * parameter.  Only the middle four slots (2..5) actually write the
 * destination GPR: .z/.w from the ZW group and .x/.y from the XY group.
 */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index: two ij pairs are packed
	 * per GPR, base_chan picks the half used by this pair */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		/* first ALU group computes ZW, second computes XY */
		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only slots 2..5 produce a result to store */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the two channels of the ij pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1; /* close each ALU group of four */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
491
492static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
493{
494	int i, r;
495	struct r600_bytecode_alu alu;
496
497	for (i = 0; i < 4; i++) {
498		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
499
500		alu.op = ALU_OP1_INTERP_LOAD_P0;
501
502		alu.dst.sel = ctx->shader->input[input].gpr;
503		alu.dst.write = 1;
504
505		alu.dst.chan = i;
506
507		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
508		alu.src[0].chan = i;
509
510		if (i == 3)
511			alu.last = 1;
512		r = r600_bytecode_add_alu(ctx->bc, &alu);
513		if (r)
514			return r;
515	}
516	return 0;
517}
518
519/*
520 * Special export handling in shaders
521 *
522 * shader export ARRAY_BASE for EXPORT_POS:
523 * 60 is position
524 * 61 is misc vector
525 * 62, 63 are clip distance vectors
526 *
527 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
528 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
529 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
530 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
531 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
532 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
533 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
534 * exclusive from render target index)
535 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
536 *
537 *
538 * shader export ARRAY_BASE for EXPORT_PIXEL:
539 * 0-7 CB targets
540 * 61 computed Z vector
541 *
542 * The use of the values exported in the computed Z vector are controlled
543 * by DB_SHADER_CONTROL:
544 * Z_EXPORT_ENABLE - Z as a float in RED
545 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
546 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
547 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
548 * DB_SOURCE_FORMAT - export control restrictions
549 *
550 */
551
552
553/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
554static int r600_spi_sid(struct r600_shader_io * io)
555{
556	int index, name = io->name;
557
558	/* These params are handled differently, they don't need
559	 * semantic indices, so we'll use 0 for them.
560	 */
561	if (name == TGSI_SEMANTIC_POSITION ||
562	    name == TGSI_SEMANTIC_PSIZE ||
563	    name == TGSI_SEMANTIC_EDGEFLAG ||
564	    name == TGSI_SEMANTIC_FACE ||
565	    name == TGSI_SEMANTIC_SAMPLEMASK)
566		index = 0;
567	else {
568		if (name == TGSI_SEMANTIC_GENERIC) {
569			/* For generic params simply use sid from tgsi */
570			index = io->sid;
571		} else {
572			/* For non-generic params - pack name and sid into 8 bits */
573			index = 0x80 | (name<<3) | (io->sid);
574		}
575
576		/* Make sure that all really used indices have nonzero value, so
577		 * we can just compare it to 0 later instead of comparing the name
578		 * with different values to detect special cases. */
579		index++;
580	}
581
582	return index;
583};
584
585/* we need this to get a common lds index for vs/tcs/tes input/outputs */
586int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
587{
588	switch (semantic_name) {
589	case TGSI_SEMANTIC_POSITION:
590		return 0;
591	case TGSI_SEMANTIC_PSIZE:
592		return 1;
593	case TGSI_SEMANTIC_CLIPDIST:
594		assert(index <= 1);
595		return 2 + index;
596	case TGSI_SEMANTIC_GENERIC:
597		if (index <= 63-4)
598			return 4 + index - 9;
599		else
600			/* same explanation as in the default statement,
601			 * the only user hitting this is st/nine.
602			 */
603			return 0;
604
605	/* patch indices are completely separate and thus start from 0 */
606	case TGSI_SEMANTIC_TESSOUTER:
607		return 0;
608	case TGSI_SEMANTIC_TESSINNER:
609		return 1;
610	case TGSI_SEMANTIC_PATCH:
611		return 2 + index;
612
613	default:
614		/* Don't fail here. The result of this function is only used
615		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
616		 * occur, but this function is called for all vertex shaders
617		 * before it's known whether LS will be compiled or not.
618		 */
619		return 0;
620	}
621}
622
623/* turn input into interpolate on EG */
624static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
625{
626	int r = 0;
627
628	if (ctx->shader->input[index].spi_sid) {
629		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
630		if (ctx->shader->input[index].interpolate > 0) {
631			evergreen_interp_assign_ij_index(ctx, index);
632			if (!ctx->use_llvm)
633				r = evergreen_interp_alu(ctx, index);
634		} else {
635			if (!ctx->use_llvm)
636				r = evergreen_interp_flat(ctx, index);
637		}
638	}
639	return r;
640}
641
642static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
643{
644	struct r600_bytecode_alu alu;
645	int i, r;
646	int gpr_front = ctx->shader->input[front].gpr;
647	int gpr_back = ctx->shader->input[back].gpr;
648
649	for (i = 0; i < 4; i++) {
650		memset(&alu, 0, sizeof(alu));
651		alu.op = ALU_OP3_CNDGT;
652		alu.is_op3 = 1;
653		alu.dst.write = 1;
654		alu.dst.sel = gpr_front;
655		alu.src[0].sel = ctx->face_gpr;
656		alu.src[1].sel = gpr_front;
657		alu.src[2].sel = gpr_back;
658
659		alu.dst.chan = i;
660		alu.src[1].chan = i;
661		alu.src[2].chan = i;
662		alu.last = (i==3);
663
664		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
665			return r;
666	}
667
668	return 0;
669}
670
671/* execute a single slot ALU calculation */
672static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
673			  int dst_sel, int dst_chan,
674			  int src0_sel, unsigned src0_chan_val,
675			  int src1_sel, unsigned src1_chan_val)
676{
677	struct r600_bytecode_alu alu;
678	int r, i;
679
680	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
681		for (i = 0; i < 4; i++) {
682			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
683			alu.op = op;
684			alu.src[0].sel = src0_sel;
685			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
686				alu.src[0].value = src0_chan_val;
687			else
688				alu.src[0].chan = src0_chan_val;
689			alu.src[1].sel = src1_sel;
690			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
691				alu.src[1].value = src1_chan_val;
692			else
693				alu.src[1].chan = src1_chan_val;
694			alu.dst.sel = dst_sel;
695			alu.dst.chan = i;
696			alu.dst.write = i == dst_chan;
697			alu.last = (i == 3);
698			r = r600_bytecode_add_alu(ctx->bc, &alu);
699			if (r)
700				return r;
701		}
702		return 0;
703	}
704
705	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
706	alu.op = op;
707	alu.src[0].sel = src0_sel;
708	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
709		alu.src[0].value = src0_chan_val;
710	else
711		alu.src[0].chan = src0_chan_val;
712	alu.src[1].sel = src1_sel;
713	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
714		alu.src[1].value = src1_chan_val;
715	else
716		alu.src[1].chan = src1_chan_val;
717	alu.dst.sel = dst_sel;
718	alu.dst.chan = dst_chan;
719	alu.dst.write = 1;
720	alu.last = 1;
721	r = r600_bytecode_add_alu(ctx->bc, &alu);
722	if (r)
723		return r;
724	return 0;
725}
726
727/* execute a single slot ALU calculation */
728static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
729			  int dst_sel, int dst_chan,
730			  int src0_sel, unsigned src0_chan_val,
731			  int src1_sel, unsigned src1_chan_val,
732			  int src2_sel, unsigned src2_chan_val)
733{
734	struct r600_bytecode_alu alu;
735	int r;
736
737	/* validate this for other ops */
738	assert(op == ALU_OP3_MULADD_UINT24);
739	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
740	alu.op = op;
741	alu.src[0].sel = src0_sel;
742	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
743		alu.src[0].value = src0_chan_val;
744	else
745		alu.src[0].chan = src0_chan_val;
746	alu.src[1].sel = src1_sel;
747	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
748		alu.src[1].value = src1_chan_val;
749	else
750		alu.src[1].chan = src1_chan_val;
751	alu.src[2].sel = src2_sel;
752	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
753		alu.src[2].value = src2_chan_val;
754	else
755		alu.src[2].chan = src2_chan_val;
756	alu.dst.sel = dst_sel;
757	alu.dst.chan = dst_chan;
758	alu.is_op3 = 1;
759	alu.last = 1;
760	r = r600_bytecode_add_alu(ctx->bc, &alu);
761	if (r)
762		return r;
763	return 0;
764}
765
766static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
767{
768	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
769}
770
771static int r600_get_temp(struct r600_shader_ctx *ctx)
772{
773	return ctx->temp_reg + ctx->max_driver_temp_used++;
774}
775
776static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
777{
778	int i;
779	i = ctx->shader->noutput++;
780	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
781	ctx->shader->output[i].sid = 0;
782	ctx->shader->output[i].gpr = 0;
783	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
784	ctx->shader->output[i].write_mask = 0x4;
785	ctx->shader->output[i].spi_sid = prim_id_sid;
786
787	return 0;
788}
789
/* Process one TGSI declaration token: record inputs/outputs in the
 * shader metadata (assigning GPRs, SPI semantic ids and per-stage
 * bookkeeping), register indirectly-addressed temp arrays, and handle
 * the system values not already claimed by
 * allocate_system_value_inputs().  Returns 0 or -EINVAL for files this
 * backend does not support. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < Elements(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				/* FS inputs on EG+ are fetched via interpolation ALU ops */
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < Elements(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
			    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* flag the misc-vector/clip-dist state these
				 * special outputs require */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
									((d->Semantic.Index + j) << 2);
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* only indirectly addressed temp ranges need tracking */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
								   d->Range.First,
				               d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			/* non-native-int shaders see INSTANCEID as a float:
			 * convert GPR0.w in place */
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fallthrough: any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
942
/* Pre-scan the TGSI for the system values that live in dedicated input
 * GPRs (sample mask in the face GPR, sample id/pos in the fixed point
 * position GPR) and for interpolateAt* instructions, marking the
 * interpolators they need.  Enabled system values are appended to the
 * shader's inputs with GPRs allocated from gpr_offset upward.
 * Returns the next free GPR after these allocations. */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;		/* ctx field receiving the allocated GPR */
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				/* mark the interpolator this interpolateAt* uses */
				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* allocate a GPR + input slot for each enabled system value */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1022
1023/*
1024 * for evergreen we need to scan the shader to find the number of GPRs we need to
1025 * reserve for interpolation and system values
1026 *
1027 * we need to know if we are going to emit
1028 * any sample or centroid inputs
1029 * if perspective and linear are required
1030*/
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* first pass: mark interpolators needed by the declared inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				/* NOTE(review): unlike the loop above, k is used here without a
				 * >= 0 check — presumably it cannot fail for these
				 * interpolate/location combinations; confirm. */
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* round up: two interpolators' i/j pairs appear to share one GPR */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
1102
/* Fetch the sample position for a given sample id from the buffer-info
 * constant buffer.
 *
 * sample_id == NULL means fetch for the current sample (the sample id is
 * read from the Fixed Point Position GPR, channel W); otherwise the id is
 * taken from sample_id / chan_sel.  Returns the temp GPR that receives the
 * XYZW position data, or a negative error code from the bytecode emitter.
 */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* move the caller-provided sample id into t1.x so the vtx
		 * fetch can use it as the index */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
1157
1158static void tgsi_src(struct r600_shader_ctx *ctx,
1159		     const struct tgsi_full_src_register *tgsi_src,
1160		     struct r600_shader_src *r600_src)
1161{
1162	memset(r600_src, 0, sizeof(*r600_src));
1163	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1164	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1165	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1166	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1167	r600_src->neg = tgsi_src->Register.Negate;
1168	r600_src->abs = tgsi_src->Register.Absolute;
1169
1170	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1171		int index;
1172		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1173			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1174			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1175
1176			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1177			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1178			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1179				return;
1180		}
1181		index = tgsi_src->Register.Index;
1182		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1183		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1184	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1185		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1186			r600_src->swizzle[0] = 2; // Z value
1187			r600_src->swizzle[1] = 2;
1188			r600_src->swizzle[2] = 2;
1189			r600_src->swizzle[3] = 2;
1190			r600_src->sel = ctx->face_gpr;
1191		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1192			r600_src->swizzle[0] = 3; // W value
1193			r600_src->swizzle[1] = 3;
1194			r600_src->swizzle[2] = 3;
1195			r600_src->swizzle[3] = 3;
1196			r600_src->sel = ctx->fixed_pt_position_gpr;
1197		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1198			r600_src->swizzle[0] = 0;
1199			r600_src->swizzle[1] = 1;
1200			r600_src->swizzle[2] = 4;
1201			r600_src->swizzle[3] = 4;
1202			r600_src->sel = load_sample_position(ctx, NULL, -1);
1203		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1204			r600_src->swizzle[0] = 3;
1205			r600_src->swizzle[1] = 3;
1206			r600_src->swizzle[2] = 3;
1207			r600_src->swizzle[3] = 3;
1208			r600_src->sel = 0;
1209		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1210			r600_src->swizzle[0] = 0;
1211			r600_src->swizzle[1] = 0;
1212			r600_src->swizzle[2] = 0;
1213			r600_src->swizzle[3] = 0;
1214			r600_src->sel = 0;
1215		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1216			r600_src->swizzle[0] = 3;
1217			r600_src->swizzle[1] = 3;
1218			r600_src->swizzle[2] = 3;
1219			r600_src->swizzle[3] = 3;
1220			r600_src->sel = 1;
1221		}
1222	} else {
1223		if (tgsi_src->Register.Indirect)
1224			r600_src->rel = V_SQ_REL_RELATIVE;
1225		r600_src->sel = tgsi_src->Register.Index;
1226		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1227	}
1228	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1229		if (tgsi_src->Register.Dimension) {
1230			r600_src->kc_bank = tgsi_src->Dimension.Index;
1231			if (tgsi_src->Dimension.Indirect) {
1232				r600_src->kc_rel = 1;
1233			}
1234		}
1235	}
1236}
1237
/* Fetch a relatively-addressed constant from constant buffer cb_idx into
 * dst_reg via a vertex fetch.
 *
 * The fetch index is the address register (ctx->bc->ar_reg, channel
 * ar_chan), optionally biased by 'offset' (added with an integer ADD into
 * dst_reg first).  cb_rel selects indexed constant-buffer access.
 * Returns 0 on success or the bytecode emitter's error code.
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* index = AR + offset, computed into dst_reg.ar_chan */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1293
1294static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1295{
1296	struct r600_bytecode_vtx vtx;
1297	int r;
1298	unsigned index = src->Register.Index;
1299	unsigned vtx_id = src->Dimension.Index;
1300	int offset_reg = vtx_id / 3;
1301	int offset_chan = vtx_id % 3;
1302
1303	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1304	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1305
1306	if (offset_reg == 0 && offset_chan == 2)
1307		offset_chan = 3;
1308
1309	if (src->Dimension.Indirect) {
1310		int treg[3];
1311		int t2;
1312		struct r600_bytecode_alu alu;
1313		int r, i;
1314
1315		/* you have got to be shitting me -
1316		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1317		   at least this is what fglrx seems to do. */
1318		for (i = 0; i < 3; i++) {
1319			treg[i] = r600_get_temp(ctx);
1320		}
1321		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1322
1323		t2 = r600_get_temp(ctx);
1324		for (i = 0; i < 3; i++) {
1325			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1326			alu.op = ALU_OP1_MOV;
1327			alu.src[0].sel = 0;
1328			alu.src[0].chan = i == 2 ? 3 : i;
1329			alu.dst.sel = treg[i];
1330			alu.dst.chan = 0;
1331			alu.dst.write = 1;
1332			alu.last = 1;
1333			r = r600_bytecode_add_alu(ctx->bc, &alu);
1334			if (r)
1335				return r;
1336		}
1337		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1338		alu.op = ALU_OP1_MOV;
1339		alu.src[0].sel = treg[0];
1340		alu.src[0].rel = 1;
1341		alu.dst.sel = t2;
1342		alu.dst.write = 1;
1343		alu.last = 1;
1344		r = r600_bytecode_add_alu(ctx->bc, &alu);
1345		if (r)
1346			return r;
1347		offset_reg = t2;
1348	}
1349
1350
1351	memset(&vtx, 0, sizeof(vtx));
1352	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1353	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1354	vtx.src_gpr = offset_reg;
1355	vtx.src_sel_x = offset_chan;
1356	vtx.offset = index * 16; /*bytes*/
1357	vtx.mega_fetch_count = 16;
1358	vtx.dst_gpr = dst_reg;
1359	vtx.dst_sel_x = 0;		/* SEL_X */
1360	vtx.dst_sel_y = 1;		/* SEL_Y */
1361	vtx.dst_sel_z = 2;		/* SEL_Z */
1362	vtx.dst_sel_w = 3;		/* SEL_W */
1363	if (ctx->bc->chip_class >= EVERGREEN) {
1364		vtx.use_const_fields = 1;
1365	} else {
1366		vtx.data_format = FMT_32_32_32_32_FLOAT;
1367	}
1368
1369	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1370		return r;
1371
1372	return 0;
1373}
1374
1375static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1376{
1377	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1378	int i;
1379
1380	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1381		struct tgsi_full_src_register *src = &inst->Src[i];
1382
1383		if (src->Register.File == TGSI_FILE_INPUT) {
1384			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1385				/* primitive id is in R0.z */
1386				ctx->src[i].sel = 0;
1387				ctx->src[i].swizzle[0] = 2;
1388			}
1389		}
1390		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1391			int treg = r600_get_temp(ctx);
1392
1393			fetch_gs_input(ctx, src, treg);
1394			ctx->src[i].sel = treg;
1395		}
1396	}
1397	return 0;
1398}
1399
/* Lower constant-file operands of the current instruction so the resulting
 * ALU group is encodable.  tgsi_src() is run on every operand first; then:
 *  - relatively-addressed constants are fetched into a temp GPR via a
 *    vertex fetch (the -512 strips what appears to be the constant-file
 *    base from sel — confirm against ctx->file_offset setup);
 *  - with multiple direct constant operands, all but the last one are
 *    copied to temps (the hardware limits constant reads per group),
 *    tracked by the countdown 'j'.
 * Returns 0 on success or a negative error code.
 */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* translate all operands and count the constant-file ones */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j > 0 leaves the final constant operand in place */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* indirect constant: fetch it into a temp and rewrite the
			 * operand to read that temp directly */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy the constant's four channels to a temp GPR */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1454
1455/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1456static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1457{
1458	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1459	struct r600_bytecode_alu alu;
1460	int i, j, k, nliteral, r;
1461
1462	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1463		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1464			nliteral++;
1465		}
1466	}
1467	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1468		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1469			int treg = r600_get_temp(ctx);
1470			for (k = 0; k < 4; k++) {
1471				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1472				alu.op = ALU_OP1_MOV;
1473				alu.src[0].sel = ctx->src[i].sel;
1474				alu.src[0].chan = k;
1475				alu.src[0].value = ctx->src[i].value[k];
1476				alu.dst.sel = treg;
1477				alu.dst.chan = k;
1478				alu.dst.write = 1;
1479				if (k == 3)
1480					alu.last = 1;
1481				r = r600_bytecode_add_alu(ctx->bc, &alu);
1482				if (r)
1483					return r;
1484			}
1485			ctx->src[i].sel = treg;
1486			j--;
1487		}
1488	}
1489	return 0;
1490}
1491
1492static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1493{
1494	int i, r, count = ctx->shader->ninput;
1495
1496	for (i = 0; i < count; i++) {
1497		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1498			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1499			if (r)
1500				return r;
1501		}
1502	}
1503	return 0;
1504}
1505
1506static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
1507						  int stream, unsigned *stream_item_size)
1508{
1509	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1510	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
1511	int i, j, r;
1512
1513	/* Sanity checking. */
1514	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
1515		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1516		r = -EINVAL;
1517		goto out_err;
1518	}
1519	for (i = 0; i < so->num_outputs; i++) {
1520		if (so->output[i].output_buffer >= 4) {
1521			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1522				 so->output[i].output_buffer);
1523			r = -EINVAL;
1524			goto out_err;
1525		}
1526	}
1527
1528	/* Initialize locations where the outputs are stored. */
1529	for (i = 0; i < so->num_outputs; i++) {
1530
1531		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1532		start_comp[i] = so->output[i].start_component;
1533		/* Lower outputs with dst_offset < start_component.
1534		 *
1535		 * We can only output 4D vectors with a write mask, e.g. we can
1536		 * only output the W component at offset 3, etc. If we want
1537		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1538		 * to move it to X and output X. */
1539		if (so->output[i].dst_offset < so->output[i].start_component) {
1540			unsigned tmp = r600_get_temp(ctx);
1541
1542			for (j = 0; j < so->output[i].num_components; j++) {
1543				struct r600_bytecode_alu alu;
1544				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1545				alu.op = ALU_OP1_MOV;
1546				alu.src[0].sel = so_gpr[i];
1547				alu.src[0].chan = so->output[i].start_component + j;
1548
1549				alu.dst.sel = tmp;
1550				alu.dst.chan = j;
1551				alu.dst.write = 1;
1552				if (j == so->output[i].num_components - 1)
1553					alu.last = 1;
1554				r = r600_bytecode_add_alu(ctx->bc, &alu);
1555				if (r)
1556					return r;
1557			}
1558			start_comp[i] = 0;
1559			so_gpr[i] = tmp;
1560		}
1561	}
1562
1563	/* Write outputs to buffers. */
1564	for (i = 0; i < so->num_outputs; i++) {
1565		struct r600_bytecode_output output;
1566
1567		if (stream != -1 && stream != so->output[i].output_buffer)
1568			continue;
1569
1570		memset(&output, 0, sizeof(struct r600_bytecode_output));
1571		output.gpr = so_gpr[i];
1572		output.elem_size = so->output[i].num_components - 1;
1573		if (output.elem_size == 2)
1574			output.elem_size = 3; // 3 not supported, write 4 with junk at end
1575		output.array_base = so->output[i].dst_offset - start_comp[i];
1576		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1577		output.burst_count = 1;
1578		/* array_size is an upper limit for the burst_count
1579		 * with MEM_STREAM instructions */
1580		output.array_size = 0xFFF;
1581		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
1582
1583		if (ctx->bc->chip_class >= EVERGREEN) {
1584			switch (so->output[i].output_buffer) {
1585			case 0:
1586				output.op = CF_OP_MEM_STREAM0_BUF0;
1587				break;
1588			case 1:
1589				output.op = CF_OP_MEM_STREAM0_BUF1;
1590				break;
1591			case 2:
1592				output.op = CF_OP_MEM_STREAM0_BUF2;
1593				break;
1594			case 3:
1595				output.op = CF_OP_MEM_STREAM0_BUF3;
1596				break;
1597			}
1598			output.op += so->output[i].stream * 4;
1599			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
1600			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
1601		} else {
1602			switch (so->output[i].output_buffer) {
1603			case 0:
1604				output.op = CF_OP_MEM_STREAM0;
1605				break;
1606			case 1:
1607				output.op = CF_OP_MEM_STREAM1;
1608				break;
1609			case 2:
1610				output.op = CF_OP_MEM_STREAM2;
1611				break;
1612			case 3:
1613				output.op = CF_OP_MEM_STREAM3;
1614					break;
1615			}
1616			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
1617		}
1618		r = r600_bytecode_add_output(ctx->bc, &output);
1619		if (r)
1620			goto out_err;
1621	}
1622	return 0;
1623out_err:
1624	return r;
1625}
1626
1627static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1628{
1629	struct r600_bytecode_alu alu;
1630	unsigned reg;
1631
1632	if (!ctx->shader->vs_out_edgeflag)
1633		return;
1634
1635	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1636
1637	/* clamp(x, 0, 1) */
1638	memset(&alu, 0, sizeof(alu));
1639	alu.op = ALU_OP1_MOV;
1640	alu.src[0].sel = reg;
1641	alu.dst.sel = reg;
1642	alu.dst.write = 1;
1643	alu.dst.clamp = 1;
1644	alu.last = 1;
1645	r600_bytecode_add_alu(ctx->bc, &alu);
1646
1647	memset(&alu, 0, sizeof(alu));
1648	alu.op = ALU_OP1_FLT_TO_INT;
1649	alu.src[0].sel = reg;
1650	alu.dst.sel = reg;
1651	alu.dst.write = 1;
1652	alu.last = 1;
1653	r600_bytecode_add_alu(ctx->bc, &alu);
1654}
1655
/* Build the "GS copy shader": a small vertex shader that runs after a
 * geometry shader, fetches one vertex of GS output back from the GSVS ring
 * buffer and exports it as POS/PARAM data, emitting per-stream stream
 * output along the way.  On success the shader is stored in
 * gs->gs_copy_shader.
 *
 * NOTE(review): allocation failure returns 0 (as if successful), leaving
 * gs->gs_copy_shader unset — confirm callers tolerate that.
 */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int i, j, next_clip_pos = 61, next_param = 0;
	int ring;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* the copy shader exports exactly the GS outputs */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* emit per-stream streamout, each pass predicated on the stream id
	 * that was decoded into R0.y above */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				break;
			}
		}
		/* ring 0 is always emitted, so cf_jump is guaranteed non-NULL
		 * when it is dereferenced after this loop */
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only stream-0 outputs are exported for rasterization */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* make sure at least one POS export exists (all channels masked) */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* likewise at least one PARAM export */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* mark the final POS/PARAM exports as done */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* close the predicate block of the last streamout pass */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
1961
1962static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
1963{
1964	if (ind) {
1965		struct r600_bytecode_alu alu;
1966		int r;
1967
1968		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1969		alu.op = ALU_OP2_ADD_INT;
1970		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
1971		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1972		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1973		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
1974		alu.dst.write = 1;
1975		alu.last = 1;
1976		r = r600_bytecode_add_alu(ctx->bc, &alu);
1977		if (r)
1978			return r;
1979	}
1980	return 0;
1981}
1982
/* Emit MEM_RING exports writing this shader's outputs for the current
 * vertex into the GS output ring (stream 0..3), or - when compiling a
 * VS/TES running as ES - into the ES->GS ring at the offsets the bound
 * GS expects.  `stream` selects the ring op; -1 means stream 0.  `ind`
 * selects indirect addressing through the per-stream offset register,
 * otherwise the absolute offset is baked into array_base.
 * NOTE(review): the `so` parameter is not referenced in this body -
 * presumably kept for interface symmetry with emit_streamout; verify
 * against callers before removing. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int i, k, ring_offset;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS - skip the ring write */
			if (ring_offset == -1)
				continue;
		} else {
			/* GS itself: outputs are packed densely, 16 bytes (one
			 * vec4) per output */
			ring_offset = idx * 16;
			idx++;
		}

		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* pick the ring matching the requested vertex stream */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2052
2053
2054static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2055{
2056	int r;
2057	struct r600_bytecode_vtx vtx;
2058	int temp_val = ctx->temp_reg;
2059	/* need to store the TCS output somewhere */
2060	r = single_alu_op2(ctx, ALU_OP1_MOV,
2061			   temp_val, 0,
2062			   V_SQ_ALU_SRC_LITERAL, 0,
2063			   0, 0);
2064	if (r)
2065		return r;
2066
2067	/* used by VS/TCS */
2068	if (ctx->tess_input_info) {
2069		/* fetch tcs input values into resv space */
2070		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2071		vtx.op = FETCH_OP_VFETCH;
2072		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2073		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2074		vtx.mega_fetch_count = 16;
2075		vtx.data_format = FMT_32_32_32_32;
2076		vtx.num_format_all = 2;
2077		vtx.format_comp_all = 1;
2078		vtx.use_const_fields = 0;
2079		vtx.endian = r600_endian_swap(32);
2080		vtx.srf_mode_all = 1;
2081		vtx.offset = 0;
2082		vtx.dst_gpr = ctx->tess_input_info;
2083		vtx.dst_sel_x = 0;
2084		vtx.dst_sel_y = 1;
2085		vtx.dst_sel_z = 2;
2086		vtx.dst_sel_w = 3;
2087		vtx.src_gpr = temp_val;
2088		vtx.src_sel_x = 0;
2089
2090		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2091		if (r)
2092			return r;
2093	}
2094
2095	/* used by TCS/TES */
2096	if (ctx->tess_output_info) {
2097		/* fetch tcs output values into resv space */
2098		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2099		vtx.op = FETCH_OP_VFETCH;
2100		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2101		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2102		vtx.mega_fetch_count = 16;
2103		vtx.data_format = FMT_32_32_32_32;
2104		vtx.num_format_all = 2;
2105		vtx.format_comp_all = 1;
2106		vtx.use_const_fields = 0;
2107		vtx.endian = r600_endian_swap(32);
2108		vtx.srf_mode_all = 1;
2109		vtx.offset = 16;
2110		vtx.dst_gpr = ctx->tess_output_info;
2111		vtx.dst_sel_x = 0;
2112		vtx.dst_sel_y = 1;
2113		vtx.dst_sel_z = 2;
2114		vtx.dst_sel_w = 3;
2115		vtx.src_gpr = temp_val;
2116		vtx.src_sel_x = 0;
2117
2118		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2119		if (r)
2120			return r;
2121	}
2122	return 0;
2123}
2124
/* Main shader compilation entry point: translate the TGSI token stream
 * of `pipeshader` into r600-family bytecode in shader->bc, driven by
 * the per-stage compile `key`.  Phases, in order: scan/setup, GPR file
 * layout, first token pass (immediates + declarations), optional
 * two-sided-color input duplication, optional LLVM backend, second
 * token pass (instruction translation), clip-vertex lowering, stream
 * outputs, final export generation, program end marker, GPR limit
 * check, and (for GS) generation of the companion copy shader.
 * Returns 0 on success or a negative errno. */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key)
{
	struct r600_screen *rscreen = rctx->screen;
	struct r600_shader *shader = &pipeshader->shader;
	struct tgsi_token *tokens = pipeshader->selector->tokens;
	struct pipe_stream_output_info so = pipeshader->selector->so;
	struct tgsi_full_immediate *immediate;
	struct r600_shader_ctx ctx;
	struct r600_bytecode_output output[32];
	unsigned output_done, noutput;
	unsigned opcode;
	int i, j, k, r = 0;
	int next_param_base = 0, next_clip_base;
	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
	/* Declarations used by llvm code */
	bool use_llvm = false;
	bool indirect_gprs;
	bool ring_outputs = false;
	bool pos_emitted = false;

#ifdef R600_USE_LLVM
	use_llvm = rscreen->b.debug_flags & DBG_LLVM;
#endif
	ctx.bc = &shader->bc;
	ctx.shader = shader;
	ctx.native_integers = true;


	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
			   rscreen->has_compressed_msaa_texturing);
	ctx.tokens = tokens;
	tgsi_scan_shader(tokens, &ctx.info);
	shader->indirect_files = ctx.info.indirect_files;

	shader->uses_doubles = ctx.info.uses_doubles;

	/* indirect addressing of constants/samplers is handled via kcache
	 * banks, only indirect GPR files need array bookkeeping below */
	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
	tgsi_parse_init(&ctx.parse, tokens);
	ctx.type = ctx.info.processor;
	shader->processor_type = ctx.type;
	ctx.bc->type = shader->processor_type;

	/* per-stage key handling; stages feeding a GS (or the GS itself)
	 * write outputs through rings instead of exports */
	switch (ctx.type) {
	case TGSI_PROCESSOR_VERTEX:
		shader->vs_as_gs_a = key.vs.as_gs_a;
		shader->vs_as_es = key.vs.as_es;
		shader->vs_as_ls = key.vs.as_ls;
		if (shader->vs_as_es)
			ring_outputs = true;
		break;
	case TGSI_PROCESSOR_GEOMETRY:
		ring_outputs = true;
		break;
	case TGSI_PROCESSOR_TESS_CTRL:
		shader->tcs_prim_mode = key.tcs.prim_mode;
		break;
	case TGSI_PROCESSOR_TESS_EVAL:
		shader->tes_as_es = key.tes.as_es;
		if (shader->tes_as_es)
			ring_outputs = true;
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		shader->two_side = key.ps.color_two_side;
		break;
	default:
		break;
	}

	/* an ES needs the consuming GS to map its outputs to ring offsets */
	if (shader->vs_as_es || shader->tes_as_es) {
		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
	} else {
		ctx.gs_for_vs = NULL;
	}

	ctx.next_ring_offset = 0;
	ctx.gs_out_ring_offset = 0;
	ctx.gs_next_vertex = 0;
	ctx.gs_stream_output_info = &so;

	ctx.face_gpr = -1;
	ctx.fixed_pt_position_gpr = -1;
	ctx.fragcoord_input = -1;
	ctx.colors_used = 0;
	ctx.clip_vertex_write = 0;

	shader->nr_ps_color_exports = 0;
	shader->nr_ps_max_color_exports = 0;


	/* register allocations */
	/* Values [0,127] correspond to GPR[0..127].
	 * Values [128,159] correspond to constant buffer bank 0
	 * Values [160,191] correspond to constant buffer bank 1
	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
	 * Values [256,287] correspond to constant buffer bank 2 (EG)
	 * Values [288,319] correspond to constant buffer bank 3 (EG)
	 * Other special values are shown in the list below.
	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
	 * 248	SQ_ALU_SRC_0: special constant 0.0.
	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
	 * 254	SQ_ALU_SRC_PV: previous vector result.
	 * 255	SQ_ALU_SRC_PS: previous scalar result.
	 */
	for (i = 0; i < TGSI_FILE_COUNT; i++) {
		ctx.file_offset[i] = 0;
	}

#ifdef R600_USE_LLVM
	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
		fprintf(stderr, "Warning: R600 LLVM backend does not support "
				"indirect adressing.  Falling back to TGSI "
				"backend.\n");
		use_llvm = 0;
	}
#endif
	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
		/* GPR0 is reserved for the fetch-shader results */
		ctx.file_offset[TGSI_FILE_INPUT] = 1;
		if (!use_llvm) {
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
		}
	}
	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
		if (ctx.bc->chip_class >= EVERGREEN)
			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
		else
			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
	}
	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
		ctx.file_offset[TGSI_FILE_INPUT] = 2;
	}
	ctx.use_llvm = use_llvm;

	if (use_llvm) {
		ctx.file_offset[TGSI_FILE_OUTPUT] =
			ctx.file_offset[TGSI_FILE_INPUT];
	} else {
	   ctx.file_offset[TGSI_FILE_OUTPUT] =
			ctx.file_offset[TGSI_FILE_INPUT] +
			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
	}
	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;

	/* Outside the GPR range. This will be translated to one of the
	 * kcache banks later. */
	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;

	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;

	/* per-stage scratch GPRs are laid out right after the AR/index regs */
	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
		ctx.tess_input_info = ctx.bc->ar_reg + 3;
		ctx.tess_output_info = ctx.bc->ar_reg + 4;
		ctx.temp_reg = ctx.bc->ar_reg + 5;
	} else if (ctx.type == TGSI_PROCESSOR_TESS_EVAL) {
		ctx.tess_input_info = 0;
		ctx.tess_output_info = ctx.bc->ar_reg + 3;
		ctx.temp_reg = ctx.bc->ar_reg + 4;
	} else if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
		ctx.temp_reg = ctx.bc->ar_reg + 7;
	} else {
		ctx.temp_reg = ctx.bc->ar_reg + 3;
	}

	shader->max_arrays = 0;
	shader->num_arrays = 0;
	if (indirect_gprs) {

		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
			                   ctx.file_offset[TGSI_FILE_INPUT],
			                   0x0F);
		}
		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
			                   ctx.file_offset[TGSI_FILE_OUTPUT],
			                   0x0F);
		}
	}

	ctx.nliterals = 0;
	ctx.literals = NULL;

	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];

	if (shader->vs_as_gs_a)
		vs_add_primid_output(&ctx, key.vs.prim_id_out);

	if (ctx.type == TGSI_PROCESSOR_TESS_EVAL)
		r600_fetch_tess_io_info(&ctx);

	/* First token pass: collect immediates (4 dwords each, hence the
	 * *16 byte resize) and run declaration handlers; instructions are
	 * translated in the second pass below. */
	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
		tgsi_parse_token(&ctx.parse);
		switch (ctx.parse.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
			immediate = &ctx.parse.FullToken.FullImmediate;
			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
			if(ctx.literals == NULL) {
				r = -ENOMEM;
				goto out_err;
			}
			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
			ctx.nliterals++;
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
			r = tgsi_declaration(&ctx);
			if (r)
				goto out_err;
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
		case TGSI_TOKEN_TYPE_PROPERTY:
			break;
		default:
			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
			r = -EINVAL;
			goto out_err;
		}
	}

	shader->ring_item_sizes[0] = ctx.next_ring_offset;
	shader->ring_item_sizes[1] = 0;
	shader->ring_item_sizes[2] = 0;
	shader->ring_item_sizes[3] = 0;

	/* Process two side if needed */
	if (shader->two_side && ctx.colors_used) {
		int i, count = ctx.shader->ninput;
		unsigned next_lds_loc = ctx.shader->nlds;

		/* additional inputs will be allocated right after the existing inputs,
		 * we won't need them after the color selection, so we don't need to
		 * reserve these gprs for the rest of the shader code and to adjust
		 * output offsets etc. */
		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
				ctx.info.file_max[TGSI_FILE_INPUT] + 1;

		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
		if (ctx.face_gpr == -1) {
			i = ctx.shader->ninput++;
			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
			ctx.shader->input[i].spi_sid = 0;
			ctx.shader->input[i].gpr = gpr++;
			ctx.face_gpr = ctx.shader->input[i].gpr;
		}

		/* duplicate every COLOR input as a BCOLOR back-face input */
		for (i = 0; i < count; i++) {
			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
				int ni = ctx.shader->ninput++;
				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
				ctx.shader->input[ni].gpr = gpr++;
				// TGSI to LLVM needs to know the lds position of inputs.
				// Non LLVM path computes it later (in process_twoside_color)
				ctx.shader->input[ni].lds_pos = next_lds_loc++;
				ctx.shader->input[i].back_color_input = ni;
				if (ctx.bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(&ctx, ni)))
						return r;
				}
			}
		}
	}

/* LLVM backend setup */
#ifdef R600_USE_LLVM
	if (use_llvm) {
		struct radeon_llvm_context radeon_llvm_ctx;
		LLVMModuleRef mod;
		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
		boolean use_kill = false;

		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
		radeon_llvm_ctx.type = ctx.type;
		radeon_llvm_ctx.two_side = shader->two_side;
		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
		radeon_llvm_ctx.color_buffer_count = max_color_exports;
		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
		radeon_llvm_ctx.stream_outputs = &so;
		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
		radeon_llvm_ctx.has_compressed_msaa_texturing =
			ctx.bc->has_compressed_msaa_texturing;
		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;

		/* on LLVM failure fall back to the TGSI path below */
		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
			radeon_llvm_dispose(&radeon_llvm_ctx);
			use_llvm = 0;
			fprintf(stderr, "R600 LLVM backend failed to compile "
				"shader.  Falling back to TGSI\n");
		} else {
			ctx.file_offset[TGSI_FILE_OUTPUT] =
					ctx.file_offset[TGSI_FILE_INPUT];
		}
		if (use_kill)
			ctx.shader->uses_kill = use_kill;
		radeon_llvm_dispose(&radeon_llvm_ctx);
	}
#endif
/* End of LLVM backend setup */

	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
		shader->nr_ps_max_color_exports = 8;

	if (!use_llvm) {
		/* fragcoord.w arrives as 1/w from the hardware; emit a
		 * reciprocal so the shader sees w (CAYMAN needs the op
		 * replicated across all four vector slots) */
		if (ctx.fragcoord_input >= 0) {
			if (ctx.bc->chip_class == CAYMAN) {
				for (j = 0 ; j < 4; j++) {
					struct r600_bytecode_alu alu;
					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
					alu.op = ALU_OP1_RECIP_IEEE;
					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
					alu.src[0].chan = 3;

					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
					alu.dst.chan = j;
					alu.dst.write = (j == 3);
					alu.last = 1;
					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
						return r;
				}
			} else {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
				alu.src[0].chan = 3;

				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
					return r;
			}
		}

		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
			struct r600_bytecode_alu alu;
			int r;

			/* GS thread with no output workaround - emit a cut at start of GS */
			if (ctx.bc->chip_class == R600)
				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);

			/* zero the per-stream ring offset registers */
			for (j = 0; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[0].value = 0;
				alu.dst.sel = ctx.gs_export_gpr_tregs[j];
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx.bc, &alu);
				if (r)
					return r;
			}
		}

		if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
			r600_fetch_tess_io_info(&ctx);

		if (shader->two_side && ctx.colors_used) {
			if ((r = process_twoside_color_inputs(&ctx)))
				return r;
		}

		/* Second token pass: translate each instruction through the
		 * chip-specific opcode table. */
		tgsi_parse_init(&ctx.parse, tokens);
		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
			tgsi_parse_token(&ctx.parse);
			switch (ctx.parse.FullToken.Token.Type) {
			case TGSI_TOKEN_TYPE_INSTRUCTION:
				r = tgsi_is_supported(&ctx);
				if (r)
					goto out_err;
				ctx.max_driver_temp_used = 0;
				/* reserve first tmp for everyone */
				r600_get_temp(&ctx);

				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
				if ((r = tgsi_split_constant(&ctx)))
					goto out_err;
				if ((r = tgsi_split_literal_constant(&ctx)))
					goto out_err;
				if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
					if ((r = tgsi_split_gs_inputs(&ctx)))
						goto out_err;
				if (ctx.bc->chip_class == CAYMAN)
					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
				else if (ctx.bc->chip_class >= EVERGREEN)
					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
				else
					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
				r = ctx.inst_info->process(&ctx);
				if (r)
					goto out_err;
				break;
			default:
				break;
			}
		}
	}

	/* Reset the temporary register counter. */
	ctx.max_driver_temp_used = 0;

	noutput = shader->noutput;

	if (!ring_outputs && ctx.clip_vertex_write) {
		unsigned clipdist_temp[2];

		clipdist_temp[0] = r600_get_temp(&ctx);
		clipdist_temp[1] = r600_get_temp(&ctx);

		/* need to convert a clipvertex write into clipdistance writes and not export
		   the clip vertex anymore */

		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
		shader->output[noutput].gpr = clipdist_temp[0];
		noutput++;
		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
		shader->output[noutput].gpr = clipdist_temp[1];
		noutput++;

		/* reset spi_sid for clipvertex output to avoid confusing spi */
		shader->output[ctx.cv_output].spi_sid = 0;

		shader->clip_dist_write = 0xFF;

		/* DOT4 the clip vertex against each of the 8 user clip
		 * planes (kcache constants 512+i); each dot product lands
		 * in one channel of the two clipdist temps */
		for (i = 0; i < 8; i++) {
			int oreg = i >> 2;
			int ochan = i & 3;

			for (j = 0; j < 4; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_DOT4;
				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
				alu.src[0].chan = j;

				alu.src[1].sel = 512 + i;
				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
				alu.src[1].chan = j;

				alu.dst.sel = clipdist_temp[oreg];
				alu.dst.chan = j;
				alu.dst.write = (j == ochan);
				if (j == 3)
					alu.last = 1;
				if (!use_llvm)
					r = r600_bytecode_add_alu(ctx.bc, &alu);
				if (r)
					return r;
			}
		}
	}

	/* Add stream outputs. */
	if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
	    so.num_outputs && !use_llvm)
		emit_streamout(&ctx, &so, -1, NULL);

	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
	convert_edgeflag_to_int(&ctx);

	if (ring_outputs) {
		if (shader->vs_as_es || shader->tes_as_es) {
			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
			ctx.gs_export_gpr_tregs[1] = -1;
			ctx.gs_export_gpr_tregs[2] = -1;
			ctx.gs_export_gpr_tregs[3] = -1;

			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
		}
	} else {
		/* Export output */
		next_clip_base = shader->vs_out_misc_write ? 62 : 61;

		/* build the export table; note j can diverge from i when an
		 * output is duplicated (POS + PARAM) or skipped (j--) */
		for (i = 0, j = 0; i < noutput; i++, j++) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = shader->output[i].gpr;
			output[j].elem_size = 3;
			output[j].swizzle_x = 0;
			output[j].swizzle_y = 1;
			output[j].swizzle_z = 2;
			output[j].swizzle_w = 3;
			output[j].burst_count = 1;
			output[j].type = -1;
			output[j].op = CF_OP_EXPORT;
			switch (ctx.type) {
			case TGSI_PROCESSOR_VERTEX:
				switch (shader->output[i].name) {
				case TGSI_SEMANTIC_POSITION:
					output[j].array_base = 60;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;

				case TGSI_SEMANTIC_PSIZE:
					output[j].array_base = 61;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 7;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 0;
					output[j].swizzle_z = 7;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_LAYER:
					/* spi_sid is 0 for outputs that are
					 * not consumed by PS */
					if (shader->output[i].spi_sid) {
						output[j].array_base = next_param_base++;
						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
						j++;
						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
					}
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 0;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					/* spi_sid is 0 for outputs that are
					 * not consumed by PS */
					if (shader->output[i].spi_sid) {
						output[j].array_base = next_param_base++;
						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
						j++;
						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
					}
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 7;
					output[j].swizzle_w = 0;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					/* lowered to clipdist writes above - drop the export */
					j--;
					break;
				case TGSI_SEMANTIC_CLIPDIST:
					output[j].array_base = next_clip_base++;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
					pos_emitted = true;
					/* spi_sid is 0 for clipdistance outputs that were generated
					 * for clipvertex - we don't need to pass them to PS */
					if (shader->output[i].spi_sid) {
						j++;
						/* duplicate it as PARAM to pass to the pixel shader */
						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
						output[j].array_base = next_param_base++;
						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
					}
					break;
				case TGSI_SEMANTIC_FOG:
					output[j].swizzle_y = 4; /* 0 */
					output[j].swizzle_z = 4; /* 0 */
					output[j].swizzle_w = 5; /* 1 */
					break;
				case TGSI_SEMANTIC_PRIMID:
					output[j].swizzle_x = 2;
					output[j].swizzle_y = 4; /* 0 */
					output[j].swizzle_z = 4; /* 0 */
					output[j].swizzle_w = 4; /* 0 */
					break;
				}

				break;
			case TGSI_PROCESSOR_FRAGMENT:
				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
					/* never export more colors than the number of CBs */
					if (shader->output[i].sid >= max_color_exports) {
						/* skip export */
						j--;
						continue;
					}
					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
					output[j].array_base = shader->output[i].sid;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
					shader->nr_ps_color_exports++;
					/* replicate color0 to every CB for write-all */
					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
						for (k = 1; k < max_color_exports; k++) {
							j++;
							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
							output[j].gpr = shader->output[i].gpr;
							output[j].elem_size = 3;
							output[j].swizzle_x = 0;
							output[j].swizzle_y = 1;
							output[j].swizzle_z = 2;
							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
							output[j].burst_count = 1;
							output[j].array_base = k;
							output[j].op = CF_OP_EXPORT;
							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
							shader->nr_ps_color_exports++;
						}
					}
				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
					output[j].array_base = 61;
					output[j].swizzle_x = 2;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 1;
					output[j].swizzle_z = output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
					output[j].array_base = 61;
					output[j].swizzle_x = 7;
					output[j].swizzle_y = 7;
					output[j].swizzle_z = 0;
					output[j].swizzle_w = 7;
					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
				} else {
					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
					r = -EINVAL;
					goto out_err;
				}
				break;
			default:
				R600_ERR("unsupported processor type %d\n", ctx.type);
				r = -EINVAL;
				goto out_err;
			}

			/* anything not classified above goes out as a PARAM */
			if (output[j].type==-1) {
				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
				output[j].array_base = next_param_base++;
			}
		}

		/* add fake position export */
		if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output[j].array_base = 60;
			output[j].op = CF_OP_EXPORT;
			j++;
		}

		/* add fake param output for vertex shader if no param is exported */
		if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
			output[j].array_base = 0;
			output[j].op = CF_OP_EXPORT;
			j++;
		}

		/* add fake pixel export */
		if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
			output[j].gpr = 0;
			output[j].elem_size = 3;
			output[j].swizzle_x = 7;
			output[j].swizzle_y = 7;
			output[j].swizzle_z = 7;
			output[j].swizzle_w = 7;
			output[j].burst_count = 1;
			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
			output[j].array_base = 0;
			output[j].op = CF_OP_EXPORT;
			j++;
			shader->nr_ps_color_exports++;
		}

		noutput = j;

		/* set export done on last export of each type */
		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
			if (!(output_done & (1 << output[i].type))) {
				output_done |= (1 << output[i].type);
				output[i].op = CF_OP_EXPORT_DONE;
			}
		}
		/* add output to bytecode */
		if (!use_llvm) {
			for (i = 0; i < noutput; i++) {
				r = r600_bytecode_add_output(ctx.bc, &output[i]);
				if (r)
					goto out_err;
			}
		}
	}

	/* add program end */
	if (!use_llvm) {
		if (ctx.bc->chip_class == CAYMAN)
			cm_bytecode_add_cf_end(ctx.bc);
		else {
			const struct cf_op_info *last = NULL;

			if (ctx.bc->cf_last)
				last = r600_isa_cf(ctx.bc->cf_last->op);

			/* alu clause instructions don't have EOP bit, so add NOP */
			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);

			ctx.bc->cf_last->end_of_program = 1;
		}
	}

	/* check GPR limit - we have 124 = 128 - 4
	 * (4 are reserved as alu clause temporary registers) */
	if (ctx.bc->ngpr > 124) {
		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
		r = -ENOMEM;
		goto out_err;
	}

	/* a GS always needs a companion copy shader to move ring data to
	 * the real exports */
	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
			return r;
	}

	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return 0;
out_err:
	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return r;
}
2908
2909static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2910{
2911	const unsigned tgsi_opcode =
2912		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
2913	R600_ERR("%s tgsi opcode unsupported\n",
2914		 tgsi_get_opcode_name(tgsi_opcode));
2915	return -EINVAL;
2916}
2917
/* Handler for TGSI_OPCODE_END: a no-op — export setup and the
 * end-of-program flag are emitted by the main translation loop. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2922
2923static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2924			const struct r600_shader_src *shader_src,
2925			unsigned chan)
2926{
2927	bc_src->sel = shader_src->sel;
2928	bc_src->chan = shader_src->swizzle[chan];
2929	bc_src->neg = shader_src->neg;
2930	bc_src->abs = shader_src->abs;
2931	bc_src->rel = shader_src->rel;
2932	bc_src->value = shader_src->value[bc_src->chan];
2933	bc_src->kc_bank = shader_src->kc_bank;
2934	bc_src->kc_rel = shader_src->kc_rel;
2935}
2936
/* Force the absolute-value modifier on an ALU source and clear any
 * pending negate so the operand is read as plain |x|. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}
2942
/* Flip the negate modifier on an ALU source (used e.g. to turn an ADD
 * into a SUB, or to negate the high word of a double). */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}
2947
2948static void tgsi_dst(struct r600_shader_ctx *ctx,
2949		     const struct tgsi_full_dst_register *tgsi_dst,
2950		     unsigned swizzle,
2951		     struct r600_bytecode_alu_dst *r600_dst)
2952{
2953	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2954
2955	r600_dst->sel = tgsi_dst->Register.Index;
2956	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2957	r600_dst->chan = swizzle;
2958	r600_dst->write = 1;
2959	if (tgsi_dst->Register.Indirect)
2960		r600_dst->rel = V_SQ_REL_RELATIVE;
2961	if (inst->Instruction.Saturate) {
2962		r600_dst->clamp = 1;
2963	}
2964}
2965
/* Emit a two-source 64-bit (double) ALU operation.
 *
 * Doubles occupy channel pairs (xy / zw).  When 'singledest' is set the
 * TGSI destination is a single 32-bit channel: the writemask is widened
 * to the containing pair and, when the wanted channel is the odd half of
 * a pair, the result is staged in temp_reg and moved afterwards.
 * 'swap' emits the sources in reverse order (src1 OP src0).
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* widen the mask to the full pair; use_tmp records which temp
		 * channel (minus one) will hold the wanted result */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* suppress the write on the odd channel of each pair */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch pairs the low/high source words */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				/* SUB = ADD with negated src1 */
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3068
3069static int tgsi_op2_64(struct r600_shader_ctx *ctx)
3070{
3071	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3072	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3073	/* confirm writemasking */
3074	if ((write_mask & 0x3) != 0x3 &&
3075	    (write_mask & 0xc) != 0xc) {
3076		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
3077		return -1;
3078	}
3079	return tgsi_op2_64_params(ctx, false, false);
3080}
3081
/* 64-bit op whose TGSI destination is a single 32-bit channel;
 * sources in TGSI order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
3086
/* 64-bit op with a single 32-bit destination channel and the two
 * sources emitted in swapped order (src1 OP src0). */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
3091
3092static int tgsi_op3_64(struct r600_shader_ctx *ctx)
3093{
3094	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3095	struct r600_bytecode_alu alu;
3096	int i, j, r;
3097	int lasti = 3;
3098	int tmp = r600_get_temp(ctx);
3099
3100	for (i = 0; i < lasti + 1; i++) {
3101
3102		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3103		alu.op = ctx->inst_info->op;
3104		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3105			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
3106		}
3107
3108		if (inst->Dst[0].Register.WriteMask & (1 << i))
3109			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3110		else
3111			alu.dst.sel = tmp;
3112
3113		alu.dst.chan = i;
3114		alu.is_op3 = 1;
3115		if (i == lasti) {
3116			alu.last = 1;
3117		}
3118		r = r600_bytecode_add_alu(ctx->bc, &alu);
3119		if (r)
3120			return r;
3121	}
3122	return 0;
3123}
3124
/* Emit a two-source 32-bit ALU op over every channel in the writemask.
 *
 * swap: emit the sources in reverse order (src1 OP src0).
 * trans_only: every emitted instruction closes its ALU group (alu.last),
 * and when more than one destination component is written the results
 * are staged through temp_reg and copied out afterwards — presumably
 * because the op only runs in the scalar/trans slot (see CAYMAN notes
 * at the top of the file).
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is implemented as ADD with negated src1 */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3194
/* Standard two-source op: sources in TGSI order, no staging needed. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
3199
/* Two-source op emitted with the sources swapped (src1 OP src0). */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
3204
/* Two-source op forced into trans-only emission (one group per
 * instruction, results staged through a temp when needed). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
3209
3210static int tgsi_ineg(struct r600_shader_ctx *ctx)
3211{
3212	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3213	struct r600_bytecode_alu alu;
3214	int i, r;
3215	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3216
3217	for (i = 0; i < lasti + 1; i++) {
3218
3219		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3220			continue;
3221		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3222		alu.op = ctx->inst_info->op;
3223
3224		alu.src[0].sel = V_SQ_ALU_SRC_0;
3225
3226		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3227
3228		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3229
3230		if (i == lasti) {
3231			alu.last = 1;
3232		}
3233		r = r600_bytecode_add_alu(ctx->bc, &alu);
3234		if (r)
3235			return r;
3236	}
3237	return 0;
3238
3239}
3240
3241static int tgsi_dneg(struct r600_shader_ctx *ctx)
3242{
3243	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3244	struct r600_bytecode_alu alu;
3245	int i, r;
3246	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3247
3248	for (i = 0; i < lasti + 1; i++) {
3249
3250		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3251			continue;
3252		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3253		alu.op = ALU_OP1_MOV;
3254
3255		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3256
3257		if (i == 1 || i == 3)
3258			r600_bytecode_src_toggle_neg(&alu.src[0]);
3259		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3260
3261		if (i == lasti) {
3262			alu.last = 1;
3263		}
3264		r = r600_bytecode_add_alu(ctx->bc, &alu);
3265		if (r)
3266			return r;
3267	}
3268	return 0;
3269
3270}
3271
/* DFRACEXP: split a double into fraction and exponent.
 * The op runs over all four slots into temp_reg; the fractional double
 * is copied from temp.zw into dst0's channel pair and the exponent is
 * written to the first enabled channel of dst1. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* dst0 may be either the xy or the zw channel pair */
	int firsti = write_mask == 0xc ? 2 : 0;

	/* run the op in every slot, collecting results in temp_reg */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* the fraction result is taken from temp channels 2/3 */
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			/* NOTE(review): the exponent is read from temp channel
			 * 1 — presumably that is where the hw slot layout
			 * leaves it; confirm against DFRACEXP semantics */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* only the first enabled channel of dst1 is written */
			break;
		}
	}
	return 0;
}
3332
3333
/* I2D / U2D on Evergreen/Cayman: convert 32-bit (u)ints to doubles.
 * Pass 1 converts the source ints to 32-bit floats in temp_reg; pass 2
 * widens them with FLT32_TO_FLT64, feeding a literal 0 as the second
 * (odd-channel) input word of each destination pair. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* one int->float conversion per destination pair
	 * NOTE(review): for an odd lasti the loop bound (lasti+1)/2 emits one
	 * conversion more than strictly needed; looks harmless (extra temp
	 * write) — confirm. */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		/* even channels read the converted float from the temp, odd
		 * channels read a zero literal for the upper word */
		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3380
3381static int egcm_double_to_int(struct r600_shader_ctx *ctx)
3382{
3383	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3384	struct r600_bytecode_alu alu;
3385	int i, r;
3386	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3387
3388	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
3389		inst->Instruction.Opcode == TGSI_OPCODE_D2U);
3390
3391	for (i = 0; i <= lasti; i++) {
3392		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3393		alu.op = ALU_OP1_FLT64_TO_FLT32;
3394
3395		r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
3396		alu.dst.chan = i;
3397		alu.dst.sel = ctx->temp_reg;
3398		alu.dst.write = i%2 == 0;
3399		alu.last = i == lasti;
3400
3401		r = r600_bytecode_add_alu(ctx->bc, &alu);
3402		if (r)
3403			return r;
3404	}
3405
3406	for (i = 0; i <= (lasti+1)/2; i++) {
3407		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3408		alu.op = ctx->inst_info->op;
3409
3410		alu.src[0].chan = i*2;
3411		alu.src[0].sel = ctx->temp_reg;
3412		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
3413		alu.last = 1;
3414
3415		r = r600_bytecode_add_alu(ctx->bc, &alu);
3416		if (r)
3417			return r;
3418	}
3419
3420	return 0;
3421}
3422
/* Cayman: emit a single-source double op (DRSQ, DSQRT and similar).
 * The op runs in slots 0..2 with the source's high word in src0 and low
 * word in src1; the 64-bit result lands in t1.xy and is then replicated
 * to every enabled destination channel pair. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		/* src0 takes the operand's high word (chan 1), src1 the low */
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		/* keep only the x/y results */
		alu.dst.write = (i == 0 || i == 1);

		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy t1.xy to each enabled destination pair */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* x/z get the low-word result, y/w the high-word result */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3476
3477static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
3478{
3479	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3480	int i, j, r;
3481	struct r600_bytecode_alu alu;
3482	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3483
3484	for (i = 0 ; i < last_slot; i++) {
3485		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3486		alu.op = ctx->inst_info->op;
3487		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3488			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
3489
3490			/* RSQ should take the absolute value of src */
3491			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
3492				r600_bytecode_src_set_abs(&alu.src[j]);
3493			}
3494		}
3495		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3496		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3497
3498		if (i == last_slot - 1)
3499			alu.last = 1;
3500		r = r600_bytecode_add_alu(ctx->bc, &alu);
3501		if (r)
3502			return r;
3503	}
3504	return 0;
3505}
3506
3507static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
3508{
3509	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3510	int i, j, k, r;
3511	struct r600_bytecode_alu alu;
3512	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3513	int t1 = ctx->temp_reg;
3514
3515	for (k = 0; k <= lasti; k++) {
3516		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
3517			continue;
3518
3519		for (i = 0 ; i < 4; i++) {
3520			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3521			alu.op = ctx->inst_info->op;
3522			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3523				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
3524			}
3525			alu.dst.sel = t1;
3526			alu.dst.chan = i;
3527			alu.dst.write = (i == k);
3528			if (i == 3)
3529				alu.last = 1;
3530			r = r600_bytecode_add_alu(ctx->bc, &alu);
3531			if (r)
3532				return r;
3533		}
3534	}
3535
3536	for (i = 0 ; i <= lasti; i++) {
3537		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3538			continue;
3539		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3540		alu.op = ALU_OP1_MOV;
3541		alu.src[0].sel = t1;
3542		alu.src[0].chan = i;
3543		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3544		alu.dst.write = 1;
3545		if (i == lasti)
3546			alu.last = 1;
3547		r = r600_bytecode_add_alu(ctx->bc, &alu);
3548		if (r)
3549			return r;
3550	}
3551
3552	return 0;
3553}
3554
3555
3556static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
3557{
3558	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3559	int i, j, k, r;
3560	struct r600_bytecode_alu alu;
3561	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3562	int t1 = ctx->temp_reg;
3563
3564	for (k = 0; k < 2; k++) {
3565		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
3566			continue;
3567
3568		for (i = 0; i < 4; i++) {
3569			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3570			alu.op = ctx->inst_info->op;
3571			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3572				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
3573			}
3574			alu.dst.sel = t1;
3575			alu.dst.chan = i;
3576			alu.dst.write = 1;
3577			if (i == 3)
3578				alu.last = 1;
3579			r = r600_bytecode_add_alu(ctx->bc, &alu);
3580			if (r)
3581				return r;
3582		}
3583	}
3584
3585	for (i = 0; i <= lasti; i++) {
3586		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3587			continue;
3588		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3589		alu.op = ALU_OP1_MOV;
3590		alu.src[0].sel = t1;
3591		alu.src[0].chan = i;
3592		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3593		alu.dst.write = 1;
3594		if (i == lasti)
3595			alu.last = 1;
3596		r = r600_bytecode_add_alu(ctx->bc, &alu);
3597		if (r)
3598			return r;
3599	}
3600
3601	return 0;
3602}
3603
3604/*
3605 * r600 - trunc to -PI..PI range
3606 * r700 - normalize by dividing by 2PI
3607 * see fdo bug 27901
3608 */
3609static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
3610{
3611	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
3612	static float double_pi = 3.1415926535 * 2;
3613	static float neg_pi = -3.1415926535;
3614
3615	int r;
3616	struct r600_bytecode_alu alu;
3617
3618	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3619	alu.op = ALU_OP3_MULADD;
3620	alu.is_op3 = 1;
3621
3622	alu.dst.chan = 0;
3623	alu.dst.sel = ctx->temp_reg;
3624	alu.dst.write = 1;
3625
3626	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3627
3628	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3629	alu.src[1].chan = 0;
3630	alu.src[1].value = *(uint32_t *)&half_inv_pi;
3631	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3632	alu.src[2].chan = 0;
3633	alu.last = 1;
3634	r = r600_bytecode_add_alu(ctx->bc, &alu);
3635	if (r)
3636		return r;
3637
3638	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3639	alu.op = ALU_OP1_FRACT;
3640
3641	alu.dst.chan = 0;
3642	alu.dst.sel = ctx->temp_reg;
3643	alu.dst.write = 1;
3644
3645	alu.src[0].sel = ctx->temp_reg;
3646	alu.src[0].chan = 0;
3647	alu.last = 1;
3648	r = r600_bytecode_add_alu(ctx->bc, &alu);
3649	if (r)
3650		return r;
3651
3652	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3653	alu.op = ALU_OP3_MULADD;
3654	alu.is_op3 = 1;
3655
3656	alu.dst.chan = 0;
3657	alu.dst.sel = ctx->temp_reg;
3658	alu.dst.write = 1;
3659
3660	alu.src[0].sel = ctx->temp_reg;
3661	alu.src[0].chan = 0;
3662
3663	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3664	alu.src[1].chan = 0;
3665	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3666	alu.src[2].chan = 0;
3667
3668	if (ctx->bc->chip_class == R600) {
3669		alu.src[1].value = *(uint32_t *)&double_pi;
3670		alu.src[2].value = *(uint32_t *)&neg_pi;
3671	} else {
3672		alu.src[1].sel = V_SQ_ALU_SRC_1;
3673		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3674		alu.src[2].neg = 1;
3675	}
3676
3677	alu.last = 1;
3678	r = r600_bytecode_add_alu(ctx->bc, &alu);
3679	if (r)
3680		return r;
3681	return 0;
3682}
3683
3684static int cayman_trig(struct r600_shader_ctx *ctx)
3685{
3686	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3687	struct r600_bytecode_alu alu;
3688	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3689	int i, r;
3690
3691	r = tgsi_setup_trig(ctx);
3692	if (r)
3693		return r;
3694
3695
3696	for (i = 0; i < last_slot; i++) {
3697		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3698		alu.op = ctx->inst_info->op;
3699		alu.dst.chan = i;
3700
3701		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3702		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3703
3704		alu.src[0].sel = ctx->temp_reg;
3705		alu.src[0].chan = 0;
3706		if (i == last_slot - 1)
3707			alu.last = 1;
3708		r = r600_bytecode_add_alu(ctx->bc, &alu);
3709		if (r)
3710			return r;
3711	}
3712	return 0;
3713}
3714
3715static int tgsi_trig(struct r600_shader_ctx *ctx)
3716{
3717	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3718	struct r600_bytecode_alu alu;
3719	int i, r;
3720	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3721
3722	r = tgsi_setup_trig(ctx);
3723	if (r)
3724		return r;
3725
3726	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3727	alu.op = ctx->inst_info->op;
3728	alu.dst.chan = 0;
3729	alu.dst.sel = ctx->temp_reg;
3730	alu.dst.write = 1;
3731
3732	alu.src[0].sel = ctx->temp_reg;
3733	alu.src[0].chan = 0;
3734	alu.last = 1;
3735	r = r600_bytecode_add_alu(ctx->bc, &alu);
3736	if (r)
3737		return r;
3738
3739	/* replicate result */
3740	for (i = 0; i < lasti + 1; i++) {
3741		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3742			continue;
3743
3744		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3745		alu.op = ALU_OP1_MOV;
3746
3747		alu.src[0].sel = ctx->temp_reg;
3748		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3749		if (i == lasti)
3750			alu.last = 1;
3751		r = r600_bytecode_add_alu(ctx->bc, &alu);
3752		if (r)
3753			return r;
3754	}
3755	return 0;
3756}
3757
/* SCS: dst = { cos(src.x), sin(src.x), 0.0, 1.0 }, honoring the
 * writemask.  tgsi_setup_trig leaves the range-reduced angle in
 * temp_reg.x; on Cayman the COS/SIN run replicated over three vector
 * slots with only the wanted slot's write enabled. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 feeds dst.x */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 feeds dst.y */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3878
3879static int tgsi_kill(struct r600_shader_ctx *ctx)
3880{
3881	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3882	struct r600_bytecode_alu alu;
3883	int i, r;
3884
3885	for (i = 0; i < 4; i++) {
3886		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3887		alu.op = ctx->inst_info->op;
3888
3889		alu.dst.chan = i;
3890
3891		alu.src[0].sel = V_SQ_ALU_SRC_0;
3892
3893		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
3894			alu.src[1].sel = V_SQ_ALU_SRC_1;
3895			alu.src[1].neg = 1;
3896		} else {
3897			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3898		}
3899		if (i == 3) {
3900			alu.last = 1;
3901		}
3902		r = r600_bytecode_add_alu(ctx->bc, &alu);
3903		if (r)
3904			return r;
3905	}
3906
3907	/* kill must be last in ALU */
3908	ctx->bc->force_add_cf = 1;
3909	ctx->shader->uses_kill = TRUE;
3910	return 0;
3911}
3912
3913static int tgsi_lit(struct r600_shader_ctx *ctx)
3914{
3915	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3916	struct r600_bytecode_alu alu;
3917	int r;
3918
3919	/* tmp.x = max(src.y, 0.0) */
3920	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3921	alu.op = ALU_OP2_MAX;
3922	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
3923	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
3924	alu.src[1].chan = 1;
3925
3926	alu.dst.sel = ctx->temp_reg;
3927	alu.dst.chan = 0;
3928	alu.dst.write = 1;
3929
3930	alu.last = 1;
3931	r = r600_bytecode_add_alu(ctx->bc, &alu);
3932	if (r)
3933		return r;
3934
3935	if (inst->Dst[0].Register.WriteMask & (1 << 2))
3936	{
3937		int chan;
3938		int sel;
3939		int i;
3940
3941		if (ctx->bc->chip_class == CAYMAN) {
3942			for (i = 0; i < 3; i++) {
3943				/* tmp.z = log(tmp.x) */
3944				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3945				alu.op = ALU_OP1_LOG_CLAMPED;
3946				alu.src[0].sel = ctx->temp_reg;
3947				alu.src[0].chan = 0;
3948				alu.dst.sel = ctx->temp_reg;
3949				alu.dst.chan = i;
3950				if (i == 2) {
3951					alu.dst.write = 1;
3952					alu.last = 1;
3953				} else
3954					alu.dst.write = 0;
3955
3956				r = r600_bytecode_add_alu(ctx->bc, &alu);
3957				if (r)
3958					return r;
3959			}
3960		} else {
3961			/* tmp.z = log(tmp.x) */
3962			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3963			alu.op = ALU_OP1_LOG_CLAMPED;
3964			alu.src[0].sel = ctx->temp_reg;
3965			alu.src[0].chan = 0;
3966			alu.dst.sel = ctx->temp_reg;
3967			alu.dst.chan = 2;
3968			alu.dst.write = 1;
3969			alu.last = 1;
3970			r = r600_bytecode_add_alu(ctx->bc, &alu);
3971			if (r)
3972				return r;
3973		}
3974
3975		chan = alu.dst.chan;
3976		sel = alu.dst.sel;
3977
3978		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
3979		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3980		alu.op = ALU_OP3_MUL_LIT;
3981		alu.src[0].sel  = sel;
3982		alu.src[0].chan = chan;
3983		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
3984		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
3985		alu.dst.sel = ctx->temp_reg;
3986		alu.dst.chan = 0;
3987		alu.dst.write = 1;
3988		alu.is_op3 = 1;
3989		alu.last = 1;
3990		r = r600_bytecode_add_alu(ctx->bc, &alu);
3991		if (r)
3992			return r;
3993
3994		if (ctx->bc->chip_class == CAYMAN) {
3995			for (i = 0; i < 3; i++) {
3996				/* dst.z = exp(tmp.x) */
3997				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3998				alu.op = ALU_OP1_EXP_IEEE;
3999				alu.src[0].sel = ctx->temp_reg;
4000				alu.src[0].chan = 0;
4001				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4002				if (i == 2) {
4003					alu.dst.write = 1;
4004					alu.last = 1;
4005				} else
4006					alu.dst.write = 0;
4007				r = r600_bytecode_add_alu(ctx->bc, &alu);
4008				if (r)
4009					return r;
4010			}
4011		} else {
4012			/* dst.z = exp(tmp.x) */
4013			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4014			alu.op = ALU_OP1_EXP_IEEE;
4015			alu.src[0].sel = ctx->temp_reg;
4016			alu.src[0].chan = 0;
4017			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
4018			alu.last = 1;
4019			r = r600_bytecode_add_alu(ctx->bc, &alu);
4020			if (r)
4021				return r;
4022		}
4023	}
4024
4025	/* dst.x, <- 1.0  */
4026	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4027	alu.op = ALU_OP1_MOV;
4028	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
4029	alu.src[0].chan = 0;
4030	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4031	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
4032	r = r600_bytecode_add_alu(ctx->bc, &alu);
4033	if (r)
4034		return r;
4035
4036	/* dst.y = max(src.x, 0.0) */
4037	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4038	alu.op = ALU_OP2_MAX;
4039	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4040	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
4041	alu.src[1].chan = 0;
4042	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
4043	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
4044	r = r600_bytecode_add_alu(ctx->bc, &alu);
4045	if (r)
4046		return r;
4047
4048	/* dst.w, <- 1.0  */
4049	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4050	alu.op = ALU_OP1_MOV;
4051	alu.src[0].sel  = V_SQ_ALU_SRC_1;
4052	alu.src[0].chan = 0;
4053	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
4054	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
4055	alu.last = 1;
4056	r = r600_bytecode_add_alu(ctx->bc, &alu);
4057	if (r)
4058		return r;
4059
4060	return 0;
4061}
4062
4063static int tgsi_rsq(struct r600_shader_ctx *ctx)
4064{
4065	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4066	struct r600_bytecode_alu alu;
4067	int i, r;
4068
4069	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4070
4071	/* XXX:
4072	 * For state trackers other than OpenGL, we'll want to use
4073	 * _RECIPSQRT_IEEE instead.
4074	 */
4075	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
4076
4077	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4078		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4079		r600_bytecode_src_set_abs(&alu.src[i]);
4080	}
4081	alu.dst.sel = ctx->temp_reg;
4082	alu.dst.write = 1;
4083	alu.last = 1;
4084	r = r600_bytecode_add_alu(ctx->bc, &alu);
4085	if (r)
4086		return r;
4087	/* replicate result */
4088	return tgsi_helper_tempx_replicate(ctx);
4089}
4090
4091static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
4092{
4093	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4094	struct r600_bytecode_alu alu;
4095	int i, r;
4096
4097	for (i = 0; i < 4; i++) {
4098		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4099		alu.src[0].sel = ctx->temp_reg;
4100		alu.op = ALU_OP1_MOV;
4101		alu.dst.chan = i;
4102		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4103		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4104		if (i == 3)
4105			alu.last = 1;
4106		r = r600_bytecode_add_alu(ctx->bc, &alu);
4107		if (r)
4108			return r;
4109	}
4110	return 0;
4111}
4112
4113static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
4114{
4115	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4116	struct r600_bytecode_alu alu;
4117	int i, r;
4118
4119	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4120	alu.op = ctx->inst_info->op;
4121	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4122		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4123	}
4124	alu.dst.sel = ctx->temp_reg;
4125	alu.dst.write = 1;
4126	alu.last = 1;
4127	r = r600_bytecode_add_alu(ctx->bc, &alu);
4128	if (r)
4129		return r;
4130	/* replicate result */
4131	return tgsi_helper_tempx_replicate(ctx);
4132}
4133
4134static int cayman_pow(struct r600_shader_ctx *ctx)
4135{
4136	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4137	int i, r;
4138	struct r600_bytecode_alu alu;
4139	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4140
4141	for (i = 0; i < 3; i++) {
4142		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4143		alu.op = ALU_OP1_LOG_IEEE;
4144		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4145		alu.dst.sel = ctx->temp_reg;
4146		alu.dst.chan = i;
4147		alu.dst.write = 1;
4148		if (i == 2)
4149			alu.last = 1;
4150		r = r600_bytecode_add_alu(ctx->bc, &alu);
4151		if (r)
4152			return r;
4153	}
4154
4155	/* b * LOG2(a) */
4156	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4157	alu.op = ALU_OP2_MUL;
4158	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4159	alu.src[1].sel = ctx->temp_reg;
4160	alu.dst.sel = ctx->temp_reg;
4161	alu.dst.write = 1;
4162	alu.last = 1;
4163	r = r600_bytecode_add_alu(ctx->bc, &alu);
4164	if (r)
4165		return r;
4166
4167	for (i = 0; i < last_slot; i++) {
4168		/* POW(a,b) = EXP2(b * LOG2(a))*/
4169		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4170		alu.op = ALU_OP1_EXP_IEEE;
4171		alu.src[0].sel = ctx->temp_reg;
4172
4173		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4174		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4175		if (i == last_slot - 1)
4176			alu.last = 1;
4177		r = r600_bytecode_add_alu(ctx->bc, &alu);
4178		if (r)
4179			return r;
4180	}
4181	return 0;
4182}
4183
4184static int tgsi_pow(struct r600_shader_ctx *ctx)
4185{
4186	struct r600_bytecode_alu alu;
4187	int r;
4188
4189	/* LOG2(a) */
4190	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4191	alu.op = ALU_OP1_LOG_IEEE;
4192	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4193	alu.dst.sel = ctx->temp_reg;
4194	alu.dst.write = 1;
4195	alu.last = 1;
4196	r = r600_bytecode_add_alu(ctx->bc, &alu);
4197	if (r)
4198		return r;
4199	/* b * LOG2(a) */
4200	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4201	alu.op = ALU_OP2_MUL;
4202	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4203	alu.src[1].sel = ctx->temp_reg;
4204	alu.dst.sel = ctx->temp_reg;
4205	alu.dst.write = 1;
4206	alu.last = 1;
4207	r = r600_bytecode_add_alu(ctx->bc, &alu);
4208	if (r)
4209		return r;
4210	/* POW(a,b) = EXP2(b * LOG2(a))*/
4211	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4212	alu.op = ALU_OP1_EXP_IEEE;
4213	alu.src[0].sel = ctx->temp_reg;
4214	alu.dst.sel = ctx->temp_reg;
4215	alu.dst.write = 1;
4216	alu.last = 1;
4217	r = r600_bytecode_add_alu(ctx->bc, &alu);
4218	if (r)
4219		return r;
4220	return tgsi_helper_tempx_replicate(ctx);
4221}
4222
/* Emit 32-bit integer division or modulo using a hardware reciprocal plus
 * an error-correction step; the exact sequence is documented step by step
 * in the comment below.
 *
 * mod:       0 = emit the quotient (UDIV/IDIV), 1 = emit the remainder
 *            (UMOD/IMOD).
 * signed_op: 0 = operands are unsigned; 1 = operands are signed, in which
 *            case the unsigned algorithm runs on the absolute values and
 *            the sign of the result is fixed up at the end.
 *
 * Naming note: the step-by-step comment calls the dividend "src1" and the
 * divisor "src2" (1-based), while the code reads them from ctx->src[0]
 * and ctx->src[1] (0-based).
 *
 * Returns 0 on success or the error from r600_bytecode_add_alu().
 */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y       = r
	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 *   15. tmp1.z = tmp0.z + 1			= q + 1
	 *   16. tmp1.w = tmp0.z - 1			= q - 1
	 *
	 * else MOD
	 *
	 *   15. tmp1.z = tmp0.w - src2			= r - src2
	 *   16. tmp1.w = tmp0.w + src2			= r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	/* The whole sequence is emitted once per written channel. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src1 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman path: compute the reciprocal via float math
			 * (u2f -> RECIP_IEEE -> *2^32 -> f2u) instead of RECIP_UINT. */
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000; /* 2^32 as float */

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;
				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}
				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 3);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 2;

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 0;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 0;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 0;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				if (signed_op) {
					alu.src[0].sel = tmp2;
					alu.src[0].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
				}

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 2;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 12. tmp0.w = src1 - tmp0.y       = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */

			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		/* The unsigned case writes the final destination here; the signed
		 * case still needs the sign fixup below, so keep it in tmp0.z. */
		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}
	return 0;
}
5066
/* TGSI UDIV: unsigned integer division (mod=0, signed_op=0). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
5071
/* TGSI UMOD: unsigned integer remainder (mod=1, signed_op=0). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
5076
/* TGSI IDIV: signed integer division (mod=0, signed_op=1). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
5081
/* TGSI IMOD: signed integer remainder (mod=1, signed_op=1). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
5086
5087
5088static int tgsi_f2i(struct r600_shader_ctx *ctx)
5089{
5090	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5091	struct r600_bytecode_alu alu;
5092	int i, r;
5093	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5094	int last_inst = tgsi_last_instruction(write_mask);
5095
5096	for (i = 0; i < 4; i++) {
5097		if (!(write_mask & (1<<i)))
5098			continue;
5099
5100		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5101		alu.op = ALU_OP1_TRUNC;
5102
5103		alu.dst.sel = ctx->temp_reg;
5104		alu.dst.chan = i;
5105		alu.dst.write = 1;
5106
5107		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5108		if (i == last_inst)
5109			alu.last = 1;
5110		r = r600_bytecode_add_alu(ctx->bc, &alu);
5111		if (r)
5112			return r;
5113	}
5114
5115	for (i = 0; i < 4; i++) {
5116		if (!(write_mask & (1<<i)))
5117			continue;
5118
5119		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5120		alu.op = ctx->inst_info->op;
5121
5122		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5123
5124		alu.src[0].sel = ctx->temp_reg;
5125		alu.src[0].chan = i;
5126
5127		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
5128			alu.last = 1;
5129		r = r600_bytecode_add_alu(ctx->bc, &alu);
5130		if (r)
5131			return r;
5132	}
5133
5134	return 0;
5135}
5136
5137static int tgsi_iabs(struct r600_shader_ctx *ctx)
5138{
5139	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5140	struct r600_bytecode_alu alu;
5141	int i, r;
5142	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5143	int last_inst = tgsi_last_instruction(write_mask);
5144
5145	/* tmp = -src */
5146	for (i = 0; i < 4; i++) {
5147		if (!(write_mask & (1<<i)))
5148			continue;
5149
5150		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5151		alu.op = ALU_OP2_SUB_INT;
5152
5153		alu.dst.sel = ctx->temp_reg;
5154		alu.dst.chan = i;
5155		alu.dst.write = 1;
5156
5157		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5158		alu.src[0].sel = V_SQ_ALU_SRC_0;
5159
5160		if (i == last_inst)
5161			alu.last = 1;
5162		r = r600_bytecode_add_alu(ctx->bc, &alu);
5163		if (r)
5164			return r;
5165	}
5166
5167	/* dst = (src >= 0 ? src : tmp) */
5168	for (i = 0; i < 4; i++) {
5169		if (!(write_mask & (1<<i)))
5170			continue;
5171
5172		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5173		alu.op = ALU_OP3_CNDGE_INT;
5174		alu.is_op3 = 1;
5175		alu.dst.write = 1;
5176
5177		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5178
5179		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5180		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5181		alu.src[2].sel = ctx->temp_reg;
5182		alu.src[2].chan = i;
5183
5184		if (i == last_inst)
5185			alu.last = 1;
5186		r = r600_bytecode_add_alu(ctx->bc, &alu);
5187		if (r)
5188			return r;
5189	}
5190	return 0;
5191}
5192
5193static int tgsi_issg(struct r600_shader_ctx *ctx)
5194{
5195	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5196	struct r600_bytecode_alu alu;
5197	int i, r;
5198	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5199	int last_inst = tgsi_last_instruction(write_mask);
5200
5201	/* tmp = (src >= 0 ? src : -1) */
5202	for (i = 0; i < 4; i++) {
5203		if (!(write_mask & (1<<i)))
5204			continue;
5205
5206		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5207		alu.op = ALU_OP3_CNDGE_INT;
5208		alu.is_op3 = 1;
5209
5210		alu.dst.sel = ctx->temp_reg;
5211		alu.dst.chan = i;
5212		alu.dst.write = 1;
5213
5214		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5215		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5216		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
5217
5218		if (i == last_inst)
5219			alu.last = 1;
5220		r = r600_bytecode_add_alu(ctx->bc, &alu);
5221		if (r)
5222			return r;
5223	}
5224
5225	/* dst = (tmp > 0 ? 1 : tmp) */
5226	for (i = 0; i < 4; i++) {
5227		if (!(write_mask & (1<<i)))
5228			continue;
5229
5230		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5231		alu.op = ALU_OP3_CNDGT_INT;
5232		alu.is_op3 = 1;
5233		alu.dst.write = 1;
5234
5235		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5236
5237		alu.src[0].sel = ctx->temp_reg;
5238		alu.src[0].chan = i;
5239
5240		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5241
5242		alu.src[2].sel = ctx->temp_reg;
5243		alu.src[2].chan = i;
5244
5245		if (i == last_inst)
5246			alu.last = 1;
5247		r = r600_bytecode_add_alu(ctx->bc, &alu);
5248		if (r)
5249			return r;
5250	}
5251	return 0;
5252}
5253
5254
5255
5256static int tgsi_ssg(struct r600_shader_ctx *ctx)
5257{
5258	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5259	struct r600_bytecode_alu alu;
5260	int i, r;
5261
5262	/* tmp = (src > 0 ? 1 : src) */
5263	for (i = 0; i < 4; i++) {
5264		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5265		alu.op = ALU_OP3_CNDGT;
5266		alu.is_op3 = 1;
5267
5268		alu.dst.sel = ctx->temp_reg;
5269		alu.dst.chan = i;
5270
5271		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5272		alu.src[1].sel = V_SQ_ALU_SRC_1;
5273		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5274
5275		if (i == 3)
5276			alu.last = 1;
5277		r = r600_bytecode_add_alu(ctx->bc, &alu);
5278		if (r)
5279			return r;
5280	}
5281
5282	/* dst = (-tmp > 0 ? -1 : tmp) */
5283	for (i = 0; i < 4; i++) {
5284		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5285		alu.op = ALU_OP3_CNDGT;
5286		alu.is_op3 = 1;
5287		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5288
5289		alu.src[0].sel = ctx->temp_reg;
5290		alu.src[0].chan = i;
5291		alu.src[0].neg = 1;
5292
5293		alu.src[1].sel = V_SQ_ALU_SRC_1;
5294		alu.src[1].neg = 1;
5295
5296		alu.src[2].sel = ctx->temp_reg;
5297		alu.src[2].chan = i;
5298
5299		if (i == 3)
5300			alu.last = 1;
5301		r = r600_bytecode_add_alu(ctx->bc, &alu);
5302		if (r)
5303			return r;
5304	}
5305	return 0;
5306}
5307
/* Expand TGSI BFI (bitfield insert) into three ALU passes:
 *   t1  = BFM(src3, src2)   -- per-channel bit mask (presumably
 *                              ((1 << width) - 1) << offset; confirm vs ISA)
 *   t2  = src1 << src2      -- insert value shifted into position
 *   dst = BFI(t1, t2, src0) -- merge insert and base under the mask
 * src0 = base, src1 = insert, src2 = offset, src3 = width. */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* t1 reuses the per-instruction temp register. */
	t1 = ctx->temp_reg;

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);	/* width */
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);	/* offset */

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* second scratch register, allocated after the first pass is emitted */
	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);	/* insert value */
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);	/* offset */

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;	/* mask */
		alu.src[0].chan = i;
		alu.src[1].sel = t2;	/* shifted insert */
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);	/* base */

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5387
/* Expand TGSI [U]MSB (find most significant bit) on top of the FFBH
 * hardware op. TGSI indexes the result from the LSB while the hardware
 * counts from the MSB, so the raw result is flipped (t2 = 31 - t1); a
 * negative FFBH result (no bit found) is passed through unchanged by
 * the final CNDGE select. */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* only reachable via the IMSB/UMSB table entries */
	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* second scratch register for the flipped index */
	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5473
/* Translate the TGSI INTERP_* opcodes (CENTROID / OFFSET / SAMPLE) for
 * evergreen/cayman. For OFFSET/SAMPLE the center barycentrics are
 * adjusted with screen-space gradients before the INTERP_XY/ZW pair is
 * emitted; the final MOVs apply the source swizzle, since the INTERP
 * ops themselves cannot swizzle their destination. */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* two ij pairs are packed per GPR: even pair in chans 0-1, odd in 2-3 */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* fetch d(ij)/dx and d(ij)/dy of the barycentrics */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;	/* 7 = masked */
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = ij + gradH * offset.x (or sample_pos.z for SAMPLE) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy += gradV * offset.y (or sample_pos.w for SAMPLE) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Two 4-slot groups: INTERP_ZW first (writes slots 2-3), then
	 * INTERP_XY (writes slots 0-1); only i = 2..5 enable dst.write. */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		/* src0 alternates j (chan 1) and i (chan 0) per slot pair */
		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		/* NOTE(review): fixed bank swizzle — presumably required by the
		 * INTERP encoding; confirm against the evergreen ISA doc */
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5637
5638
5639static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
5640{
5641	struct r600_bytecode_alu alu;
5642	int i, r;
5643
5644	for (i = 0; i < 4; i++) {
5645		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5646		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
5647			alu.op = ALU_OP0_NOP;
5648			alu.dst.chan = i;
5649		} else {
5650			alu.op = ALU_OP1_MOV;
5651			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5652			alu.src[0].sel = ctx->temp_reg;
5653			alu.src[0].chan = i;
5654		}
5655		if (i == 3) {
5656			alu.last = 1;
5657		}
5658		r = r600_bytecode_add_alu(ctx->bc, &alu);
5659		if (r)
5660			return r;
5661	}
5662	return 0;
5663}
5664
5665static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
5666                                 unsigned temp, int chan,
5667                                 struct r600_bytecode_alu_src *bc_src,
5668                                 const struct r600_shader_src *shader_src)
5669{
5670	struct r600_bytecode_alu alu;
5671	int r;
5672
5673	r600_bytecode_src(bc_src, shader_src, chan);
5674
5675	/* op3 operands don't support abs modifier */
5676	if (bc_src->abs) {
5677		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
5678		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5679		alu.op = ALU_OP1_MOV;
5680		alu.dst.sel = temp;
5681		alu.dst.chan = chan;
5682		alu.dst.write = 1;
5683
5684		alu.src[0] = *bc_src;
5685		alu.last = true; // sufficient?
5686		r = r600_bytecode_add_alu(ctx->bc, &alu);
5687		if (r)
5688			return r;
5689
5690		memset(bc_src, 0, sizeof(*bc_src));
5691		bc_src->sel = temp;
5692		bc_src->chan = chan;
5693	}
5694	return 0;
5695}
5696
5697static int tgsi_op3(struct r600_shader_ctx *ctx)
5698{
5699	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5700	struct r600_bytecode_alu alu;
5701	int i, j, r;
5702	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5703	int temp_regs[4];
5704
5705	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5706		temp_regs[j] = 0;
5707		if (ctx->src[j].abs)
5708			temp_regs[j] = r600_get_temp(ctx);
5709	}
5710	for (i = 0; i < lasti + 1; i++) {
5711		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5712			continue;
5713
5714		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5715		alu.op = ctx->inst_info->op;
5716		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5717			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
5718			if (r)
5719				return r;
5720		}
5721
5722		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5723		alu.dst.chan = i;
5724		alu.dst.write = 1;
5725		alu.is_op3 = 1;
5726		if (i == lasti) {
5727			alu.last = 1;
5728		}
5729		r = r600_bytecode_add_alu(ctx->bc, &alu);
5730		if (r)
5731			return r;
5732	}
5733	return 0;
5734}
5735
5736static int tgsi_dp(struct r600_shader_ctx *ctx)
5737{
5738	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5739	struct r600_bytecode_alu alu;
5740	int i, j, r;
5741
5742	for (i = 0; i < 4; i++) {
5743		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5744		alu.op = ctx->inst_info->op;
5745		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5746			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5747		}
5748
5749		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5750		alu.dst.chan = i;
5751		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5752		/* handle some special cases */
5753		switch (inst->Instruction.Opcode) {
5754		case TGSI_OPCODE_DP2:
5755			if (i > 1) {
5756				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5757				alu.src[0].chan = alu.src[1].chan = 0;
5758			}
5759			break;
5760		case TGSI_OPCODE_DP3:
5761			if (i > 2) {
5762				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5763				alu.src[0].chan = alu.src[1].chan = 0;
5764			}
5765			break;
5766		case TGSI_OPCODE_DPH:
5767			if (i == 3) {
5768				alu.src[0].sel = V_SQ_ALU_SRC_1;
5769				alu.src[0].chan = 0;
5770				alu.src[0].neg = 0;
5771			}
5772			break;
5773		default:
5774			break;
5775		}
5776		if (i == 3) {
5777			alu.last = 1;
5778		}
5779		r = r600_bytecode_add_alu(ctx->bc, &alu);
5780		if (r)
5781			return r;
5782	}
5783	return 0;
5784}
5785
5786static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
5787						    unsigned index)
5788{
5789	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5790	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
5791		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
5792		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
5793		ctx->src[index].neg || ctx->src[index].abs ||
5794		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
5795}
5796
5797static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
5798					unsigned index)
5799{
5800	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5801	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
5802}
5803
5804static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
5805{
5806	struct r600_bytecode_vtx vtx;
5807	struct r600_bytecode_alu alu;
5808	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5809	int src_gpr, r, i;
5810	int id = tgsi_tex_get_src_gpr(ctx, 1);
5811
5812	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5813	if (src_requires_loading) {
5814		for (i = 0; i < 4; i++) {
5815			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5816			alu.op = ALU_OP1_MOV;
5817			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5818			alu.dst.sel = ctx->temp_reg;
5819			alu.dst.chan = i;
5820			if (i == 3)
5821				alu.last = 1;
5822			alu.dst.write = 1;
5823			r = r600_bytecode_add_alu(ctx->bc, &alu);
5824			if (r)
5825				return r;
5826		}
5827		src_gpr = ctx->temp_reg;
5828	}
5829
5830	memset(&vtx, 0, sizeof(vtx));
5831	vtx.op = FETCH_OP_VFETCH;
5832	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
5833	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
5834	vtx.src_gpr = src_gpr;
5835	vtx.mega_fetch_count = 16;
5836	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5837	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
5838	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
5839	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
5840	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
5841	vtx.use_const_fields = 1;
5842
5843	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
5844		return r;
5845
5846	if (ctx->bc->chip_class >= EVERGREEN)
5847		return 0;
5848
5849	for (i = 0; i < 4; i++) {
5850		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5851		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5852			continue;
5853
5854		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5855		alu.op = ALU_OP2_AND_INT;
5856
5857		alu.dst.chan = i;
5858		alu.dst.sel = vtx.dst_gpr;
5859		alu.dst.write = 1;
5860
5861		alu.src[0].sel = vtx.dst_gpr;
5862		alu.src[0].chan = i;
5863
5864		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
5865		alu.src[1].sel += (id * 2);
5866		alu.src[1].chan = i % 4;
5867		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5868
5869		if (i == lasti)
5870			alu.last = 1;
5871		r = r600_bytecode_add_alu(ctx->bc, &alu);
5872		if (r)
5873			return r;
5874	}
5875
5876	if (inst->Dst[0].Register.WriteMask & 3) {
5877		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5878		alu.op = ALU_OP2_OR_INT;
5879
5880		alu.dst.chan = 3;
5881		alu.dst.sel = vtx.dst_gpr;
5882		alu.dst.write = 1;
5883
5884		alu.src[0].sel = vtx.dst_gpr;
5885		alu.src[0].chan = 3;
5886
5887		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
5888		alu.src[1].chan = 0;
5889		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5890
5891		alu.last = 1;
5892		r = r600_bytecode_add_alu(ctx->bc, &alu);
5893		if (r)
5894			return r;
5895	}
5896	return 0;
5897}
5898
5899static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
5900{
5901	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5902	struct r600_bytecode_alu alu;
5903	int r;
5904	int id = tgsi_tex_get_src_gpr(ctx, 1);
5905
5906	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5907	alu.op = ALU_OP1_MOV;
5908	alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
5909	if (ctx->bc->chip_class >= EVERGREEN) {
5910		/* channel 0 or 2 of each word */
5911		alu.src[0].sel += (id / 2);
5912		alu.src[0].chan = (id % 2) * 2;
5913	} else {
5914		/* r600 we have them at channel 2 of the second dword */
5915		alu.src[0].sel += (id * 2) + 1;
5916		alu.src[0].chan = 1;
5917	}
5918	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5919	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5920	alu.last = 1;
5921	r = r600_bytecode_add_alu(ctx->bc, &alu);
5922	if (r)
5923		return r;
5924	return 0;
5925}
5926
5927static int tgsi_tex(struct r600_shader_ctx *ctx)
5928{
5929	static float one_point_five = 1.5f;
5930	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5931	struct r600_bytecode_tex tex;
5932	struct r600_bytecode_alu alu;
5933	unsigned src_gpr;
5934	int r, i, j;
5935	int opcode;
5936	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5937				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5938				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5939				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5940
5941	bool txf_add_offsets = inst->Texture.NumOffsets &&
5942			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5943			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5944
5945	/* Texture fetch instructions can only use gprs as source.
5946	 * Also they cannot negate the source or take the absolute value */
5947	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5948					      inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
5949                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
5950					     read_compressed_msaa || txf_add_offsets;
5951
5952	boolean src_loaded = FALSE;
5953	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5954	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5955	boolean has_txq_cube_array_z = false;
5956	unsigned sampler_index_mode;
5957
5958	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5959	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5960	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5961		if (inst->Dst[0].Register.WriteMask & 4) {
5962			ctx->shader->has_txq_cube_array_z_comp = true;
5963			has_txq_cube_array_z = true;
5964		}
5965
5966	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5967	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5968	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5969	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5970		sampler_src_reg = 2;
5971
5972	/* TGSI moves the sampler to src reg 3 for TXD */
5973	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
5974		sampler_src_reg = 3;
5975
5976	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
5977
5978	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5979
5980	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5981		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5982			ctx->shader->uses_tex_buffers = true;
5983			return r600_do_buffer_txq(ctx);
5984		}
5985		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5986			if (ctx->bc->chip_class < EVERGREEN)
5987				ctx->shader->uses_tex_buffers = true;
5988			return do_vtx_fetch_inst(ctx, src_requires_loading);
5989		}
5990	}
5991
5992	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5993		int out_chan;
5994		/* Add perspective divide */
5995		if (ctx->bc->chip_class == CAYMAN) {
5996			out_chan = 2;
5997			for (i = 0; i < 3; i++) {
5998				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5999				alu.op = ALU_OP1_RECIP_IEEE;
6000				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6001
6002				alu.dst.sel = ctx->temp_reg;
6003				alu.dst.chan = i;
6004				if (i == 2)
6005					alu.last = 1;
6006				if (out_chan == i)
6007					alu.dst.write = 1;
6008				r = r600_bytecode_add_alu(ctx->bc, &alu);
6009				if (r)
6010					return r;
6011			}
6012
6013		} else {
6014			out_chan = 3;
6015			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6016			alu.op = ALU_OP1_RECIP_IEEE;
6017			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6018
6019			alu.dst.sel = ctx->temp_reg;
6020			alu.dst.chan = out_chan;
6021			alu.last = 1;
6022			alu.dst.write = 1;
6023			r = r600_bytecode_add_alu(ctx->bc, &alu);
6024			if (r)
6025				return r;
6026		}
6027
6028		for (i = 0; i < 3; i++) {
6029			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6030			alu.op = ALU_OP2_MUL;
6031			alu.src[0].sel = ctx->temp_reg;
6032			alu.src[0].chan = out_chan;
6033			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6034			alu.dst.sel = ctx->temp_reg;
6035			alu.dst.chan = i;
6036			alu.dst.write = 1;
6037			r = r600_bytecode_add_alu(ctx->bc, &alu);
6038			if (r)
6039				return r;
6040		}
6041		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6042		alu.op = ALU_OP1_MOV;
6043		alu.src[0].sel = V_SQ_ALU_SRC_1;
6044		alu.src[0].chan = 0;
6045		alu.dst.sel = ctx->temp_reg;
6046		alu.dst.chan = 3;
6047		alu.last = 1;
6048		alu.dst.write = 1;
6049		r = r600_bytecode_add_alu(ctx->bc, &alu);
6050		if (r)
6051			return r;
6052		src_loaded = TRUE;
6053		src_gpr = ctx->temp_reg;
6054	}
6055
6056
6057	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6058	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6059	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6060	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6061	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
6062	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
6063
6064		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
6065		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
6066
6067		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
6068		for (i = 0; i < 4; i++) {
6069			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6070			alu.op = ALU_OP2_CUBE;
6071			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
6072			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
6073			alu.dst.sel = ctx->temp_reg;
6074			alu.dst.chan = i;
6075			if (i == 3)
6076				alu.last = 1;
6077			alu.dst.write = 1;
6078			r = r600_bytecode_add_alu(ctx->bc, &alu);
6079			if (r)
6080				return r;
6081		}
6082
6083		/* tmp1.z = RCP_e(|tmp1.z|) */
6084		if (ctx->bc->chip_class == CAYMAN) {
6085			for (i = 0; i < 3; i++) {
6086				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6087				alu.op = ALU_OP1_RECIP_IEEE;
6088				alu.src[0].sel = ctx->temp_reg;
6089				alu.src[0].chan = 2;
6090				alu.src[0].abs = 1;
6091				alu.dst.sel = ctx->temp_reg;
6092				alu.dst.chan = i;
6093				if (i == 2)
6094					alu.dst.write = 1;
6095				if (i == 2)
6096					alu.last = 1;
6097				r = r600_bytecode_add_alu(ctx->bc, &alu);
6098				if (r)
6099					return r;
6100			}
6101		} else {
6102			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6103			alu.op = ALU_OP1_RECIP_IEEE;
6104			alu.src[0].sel = ctx->temp_reg;
6105			alu.src[0].chan = 2;
6106			alu.src[0].abs = 1;
6107			alu.dst.sel = ctx->temp_reg;
6108			alu.dst.chan = 2;
6109			alu.dst.write = 1;
6110			alu.last = 1;
6111			r = r600_bytecode_add_alu(ctx->bc, &alu);
6112			if (r)
6113				return r;
6114		}
6115
6116		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
6117		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
6118		 * muladd has no writemask, have to use another temp
6119		 */
6120		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6121		alu.op = ALU_OP3_MULADD;
6122		alu.is_op3 = 1;
6123
6124		alu.src[0].sel = ctx->temp_reg;
6125		alu.src[0].chan = 0;
6126		alu.src[1].sel = ctx->temp_reg;
6127		alu.src[1].chan = 2;
6128
6129		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6130		alu.src[2].chan = 0;
6131		alu.src[2].value = *(uint32_t *)&one_point_five;
6132
6133		alu.dst.sel = ctx->temp_reg;
6134		alu.dst.chan = 0;
6135		alu.dst.write = 1;
6136
6137		r = r600_bytecode_add_alu(ctx->bc, &alu);
6138		if (r)
6139			return r;
6140
6141		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6142		alu.op = ALU_OP3_MULADD;
6143		alu.is_op3 = 1;
6144
6145		alu.src[0].sel = ctx->temp_reg;
6146		alu.src[0].chan = 1;
6147		alu.src[1].sel = ctx->temp_reg;
6148		alu.src[1].chan = 2;
6149
6150		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6151		alu.src[2].chan = 0;
6152		alu.src[2].value = *(uint32_t *)&one_point_five;
6153
6154		alu.dst.sel = ctx->temp_reg;
6155		alu.dst.chan = 1;
6156		alu.dst.write = 1;
6157
6158		alu.last = 1;
6159		r = r600_bytecode_add_alu(ctx->bc, &alu);
6160		if (r)
6161			return r;
6162		/* write initial compare value into Z component
6163		  - W src 0 for shadow cube
6164		  - X src 1 for shadow cube array */
6165		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6166		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6167			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6168			alu.op = ALU_OP1_MOV;
6169			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
6170				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6171			else
6172				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6173			alu.dst.sel = ctx->temp_reg;
6174			alu.dst.chan = 2;
6175			alu.dst.write = 1;
6176			alu.last = 1;
6177			r = r600_bytecode_add_alu(ctx->bc, &alu);
6178			if (r)
6179				return r;
6180		}
6181
6182		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6183		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6184			if (ctx->bc->chip_class >= EVERGREEN) {
6185				int mytmp = r600_get_temp(ctx);
6186				static const float eight = 8.0f;
6187				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6188				alu.op = ALU_OP1_MOV;
6189				alu.src[0].sel = ctx->temp_reg;
6190				alu.src[0].chan = 3;
6191				alu.dst.sel = mytmp;
6192				alu.dst.chan = 0;
6193				alu.dst.write = 1;
6194				alu.last = 1;
6195				r = r600_bytecode_add_alu(ctx->bc, &alu);
6196				if (r)
6197					return r;
6198
6199				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
6200				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6201				alu.op = ALU_OP3_MULADD;
6202				alu.is_op3 = 1;
6203				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6204				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6205				alu.src[1].chan = 0;
6206				alu.src[1].value = *(uint32_t *)&eight;
6207				alu.src[2].sel = mytmp;
6208				alu.src[2].chan = 0;
6209				alu.dst.sel = ctx->temp_reg;
6210				alu.dst.chan = 3;
6211				alu.dst.write = 1;
6212				alu.last = 1;
6213				r = r600_bytecode_add_alu(ctx->bc, &alu);
6214				if (r)
6215					return r;
6216			} else if (ctx->bc->chip_class < EVERGREEN) {
6217				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6218				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
6219				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6220				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6221				tex.src_gpr = r600_get_temp(ctx);
6222				tex.src_sel_x = 0;
6223				tex.src_sel_y = 0;
6224				tex.src_sel_z = 0;
6225				tex.src_sel_w = 0;
6226				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6227				tex.coord_type_x = 1;
6228				tex.coord_type_y = 1;
6229				tex.coord_type_z = 1;
6230				tex.coord_type_w = 1;
6231				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6232				alu.op = ALU_OP1_MOV;
6233				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6234				alu.dst.sel = tex.src_gpr;
6235				alu.dst.chan = 0;
6236				alu.last = 1;
6237				alu.dst.write = 1;
6238				r = r600_bytecode_add_alu(ctx->bc, &alu);
6239				if (r)
6240					return r;
6241
6242				r = r600_bytecode_add_tex(ctx->bc, &tex);
6243				if (r)
6244					return r;
6245			}
6246
6247		}
6248
6249		/* for cube forms of lod and bias we need to route things */
6250		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
6251		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
6252		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6253		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
6254			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6255			alu.op = ALU_OP1_MOV;
6256			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6257			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
6258				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6259			else
6260				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6261			alu.dst.sel = ctx->temp_reg;
6262			alu.dst.chan = 2;
6263			alu.last = 1;
6264			alu.dst.write = 1;
6265			r = r600_bytecode_add_alu(ctx->bc, &alu);
6266			if (r)
6267				return r;
6268		}
6269
6270		src_loaded = TRUE;
6271		src_gpr = ctx->temp_reg;
6272	}
6273
6274	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
6275		int temp_h = 0, temp_v = 0;
6276		int start_val = 0;
6277
6278		/* if we've already loaded the src (i.e. CUBE don't reload it). */
6279		if (src_loaded == TRUE)
6280			start_val = 1;
6281		else
6282			src_loaded = TRUE;
6283		for (i = start_val; i < 3; i++) {
6284			int treg = r600_get_temp(ctx);
6285
6286			if (i == 0)
6287				src_gpr = treg;
6288			else if (i == 1)
6289				temp_h = treg;
6290			else
6291				temp_v = treg;
6292
6293			for (j = 0; j < 4; j++) {
6294				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6295				alu.op = ALU_OP1_MOV;
6296                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
6297                                alu.dst.sel = treg;
6298                                alu.dst.chan = j;
6299                                if (j == 3)
6300                                   alu.last = 1;
6301                                alu.dst.write = 1;
6302                                r = r600_bytecode_add_alu(ctx->bc, &alu);
6303                                if (r)
6304                                    return r;
6305			}
6306		}
6307		for (i = 1; i < 3; i++) {
6308			/* set gradients h/v */
6309			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6310			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
6311				FETCH_OP_SET_GRADIENTS_V;
6312			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6313			tex.sampler_index_mode = sampler_index_mode;
6314			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6315			tex.resource_index_mode = sampler_index_mode;
6316
6317			tex.src_gpr = (i == 1) ? temp_h : temp_v;
6318			tex.src_sel_x = 0;
6319			tex.src_sel_y = 1;
6320			tex.src_sel_z = 2;
6321			tex.src_sel_w = 3;
6322
6323			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
6324			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6325			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
6326				tex.coord_type_x = 1;
6327				tex.coord_type_y = 1;
6328				tex.coord_type_z = 1;
6329				tex.coord_type_w = 1;
6330			}
6331			r = r600_bytecode_add_tex(ctx->bc, &tex);
6332			if (r)
6333				return r;
6334		}
6335	}
6336
6337	if (src_requires_loading && !src_loaded) {
6338		for (i = 0; i < 4; i++) {
6339			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6340			alu.op = ALU_OP1_MOV;
6341			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6342			alu.dst.sel = ctx->temp_reg;
6343			alu.dst.chan = i;
6344			if (i == 3)
6345				alu.last = 1;
6346			alu.dst.write = 1;
6347			r = r600_bytecode_add_alu(ctx->bc, &alu);
6348			if (r)
6349				return r;
6350		}
6351		src_loaded = TRUE;
6352		src_gpr = ctx->temp_reg;
6353	}
6354
6355	/* get offset values */
6356	if (inst->Texture.NumOffsets) {
6357		assert(inst->Texture.NumOffsets == 1);
6358
6359		/* The texture offset feature doesn't work with the TXF instruction
6360		 * and must be emulated by adding the offset to the texture coordinates. */
6361		if (txf_add_offsets) {
6362			const struct tgsi_texture_offset *off = inst->TexOffsets;
6363
6364			switch (inst->Texture.Texture) {
6365			case TGSI_TEXTURE_3D:
6366				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6367				alu.op = ALU_OP2_ADD_INT;
6368				alu.src[0].sel = src_gpr;
6369				alu.src[0].chan = 2;
6370				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6371				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
6372				alu.dst.sel = src_gpr;
6373				alu.dst.chan = 2;
6374				alu.dst.write = 1;
6375				alu.last = 1;
6376				r = r600_bytecode_add_alu(ctx->bc, &alu);
6377				if (r)
6378					return r;
6379				/* fall through */
6380
6381			case TGSI_TEXTURE_2D:
6382			case TGSI_TEXTURE_SHADOW2D:
6383			case TGSI_TEXTURE_RECT:
6384			case TGSI_TEXTURE_SHADOWRECT:
6385			case TGSI_TEXTURE_2D_ARRAY:
6386			case TGSI_TEXTURE_SHADOW2D_ARRAY:
6387				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6388				alu.op = ALU_OP2_ADD_INT;
6389				alu.src[0].sel = src_gpr;
6390				alu.src[0].chan = 1;
6391				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6392				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
6393				alu.dst.sel = src_gpr;
6394				alu.dst.chan = 1;
6395				alu.dst.write = 1;
6396				alu.last = 1;
6397				r = r600_bytecode_add_alu(ctx->bc, &alu);
6398				if (r)
6399					return r;
6400				/* fall through */
6401
6402			case TGSI_TEXTURE_1D:
6403			case TGSI_TEXTURE_SHADOW1D:
6404			case TGSI_TEXTURE_1D_ARRAY:
6405			case TGSI_TEXTURE_SHADOW1D_ARRAY:
6406				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6407				alu.op = ALU_OP2_ADD_INT;
6408				alu.src[0].sel = src_gpr;
6409				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6410				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
6411				alu.dst.sel = src_gpr;
6412				alu.dst.write = 1;
6413				alu.last = 1;
6414				r = r600_bytecode_add_alu(ctx->bc, &alu);
6415				if (r)
6416					return r;
6417				break;
6418				/* texture offsets do not apply to other texture targets */
6419			}
6420		} else {
6421			switch (inst->Texture.Texture) {
6422			case TGSI_TEXTURE_3D:
6423				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
6424				/* fallthrough */
6425			case TGSI_TEXTURE_2D:
6426			case TGSI_TEXTURE_SHADOW2D:
6427			case TGSI_TEXTURE_RECT:
6428			case TGSI_TEXTURE_SHADOWRECT:
6429			case TGSI_TEXTURE_2D_ARRAY:
6430			case TGSI_TEXTURE_SHADOW2D_ARRAY:
6431				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
6432				/* fallthrough */
6433			case TGSI_TEXTURE_1D:
6434			case TGSI_TEXTURE_SHADOW1D:
6435			case TGSI_TEXTURE_1D_ARRAY:
6436			case TGSI_TEXTURE_SHADOW1D_ARRAY:
6437				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
6438			}
6439		}
6440	}
6441
6442	/* Obtain the sample index for reading a compressed MSAA color texture.
6443	 * To read the FMASK, we use the ldfptr instruction, which tells us
6444	 * where the samples are stored.
6445	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
6446	 * which is the identity mapping. Each nibble says which physical sample
6447	 * should be fetched to get that sample.
6448	 *
6449	 * Assume src.z contains the sample index. It should be modified like this:
6450	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
6451	 * Then fetch the texel with src.
6452	 */
6453	if (read_compressed_msaa) {
6454		unsigned sample_chan = 3;
6455		unsigned temp = r600_get_temp(ctx);
6456		assert(src_loaded);
6457
6458		/* temp.w = ldfptr() */
6459		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6460		tex.op = FETCH_OP_LD;
6461		tex.inst_mod = 1; /* to indicate this is ldfptr */
6462		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6463		tex.sampler_index_mode = sampler_index_mode;
6464		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6465		tex.resource_index_mode = sampler_index_mode;
6466		tex.src_gpr = src_gpr;
6467		tex.dst_gpr = temp;
6468		tex.dst_sel_x = 7; /* mask out these components */
6469		tex.dst_sel_y = 7;
6470		tex.dst_sel_z = 7;
6471		tex.dst_sel_w = 0; /* store X */
6472		tex.src_sel_x = 0;
6473		tex.src_sel_y = 1;
6474		tex.src_sel_z = 2;
6475		tex.src_sel_w = 3;
6476		tex.offset_x = offset_x;
6477		tex.offset_y = offset_y;
6478		tex.offset_z = offset_z;
6479		r = r600_bytecode_add_tex(ctx->bc, &tex);
6480		if (r)
6481			return r;
6482
6483		/* temp.x = sample_index*4 */
6484		if (ctx->bc->chip_class == CAYMAN) {
6485			for (i = 0 ; i < 4; i++) {
6486				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6487				alu.op = ALU_OP2_MULLO_INT;
6488				alu.src[0].sel = src_gpr;
6489				alu.src[0].chan = sample_chan;
6490				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6491				alu.src[1].value = 4;
6492				alu.dst.sel = temp;
6493				alu.dst.chan = i;
6494				alu.dst.write = i == 0;
6495				if (i == 3)
6496					alu.last = 1;
6497				r = r600_bytecode_add_alu(ctx->bc, &alu);
6498				if (r)
6499					return r;
6500			}
6501		} else {
6502			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6503			alu.op = ALU_OP2_MULLO_INT;
6504			alu.src[0].sel = src_gpr;
6505			alu.src[0].chan = sample_chan;
6506			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6507			alu.src[1].value = 4;
6508			alu.dst.sel = temp;
6509			alu.dst.chan = 0;
6510			alu.dst.write = 1;
6511			alu.last = 1;
6512			r = r600_bytecode_add_alu(ctx->bc, &alu);
6513			if (r)
6514				return r;
6515		}
6516
6517		/* sample_index = temp.w >> temp.x */
6518		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6519		alu.op = ALU_OP2_LSHR_INT;
6520		alu.src[0].sel = temp;
6521		alu.src[0].chan = 3;
6522		alu.src[1].sel = temp;
6523		alu.src[1].chan = 0;
6524		alu.dst.sel = src_gpr;
6525		alu.dst.chan = sample_chan;
6526		alu.dst.write = 1;
6527		alu.last = 1;
6528		r = r600_bytecode_add_alu(ctx->bc, &alu);
6529		if (r)
6530			return r;
6531
6532		/* sample_index & 0xF */
6533		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6534		alu.op = ALU_OP2_AND_INT;
6535		alu.src[0].sel = src_gpr;
6536		alu.src[0].chan = sample_chan;
6537		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6538		alu.src[1].value = 0xF;
6539		alu.dst.sel = src_gpr;
6540		alu.dst.chan = sample_chan;
6541		alu.dst.write = 1;
6542		alu.last = 1;
6543		r = r600_bytecode_add_alu(ctx->bc, &alu);
6544		if (r)
6545			return r;
6546#if 0
6547		/* visualize the FMASK */
6548		for (i = 0; i < 4; i++) {
6549			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6550			alu.op = ALU_OP1_INT_TO_FLT;
6551			alu.src[0].sel = src_gpr;
6552			alu.src[0].chan = sample_chan;
6553			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6554			alu.dst.chan = i;
6555			alu.dst.write = 1;
6556			alu.last = 1;
6557			r = r600_bytecode_add_alu(ctx->bc, &alu);
6558			if (r)
6559				return r;
6560		}
6561		return 0;
6562#endif
6563	}
6564
6565	/* does this shader want a num layers from TXQ for a cube array? */
6566	if (has_txq_cube_array_z) {
6567		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6568
6569		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6570		alu.op = ALU_OP1_MOV;
6571
6572		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6573		if (ctx->bc->chip_class >= EVERGREEN) {
6574			/* channel 1 or 3 of each word */
6575			alu.src[0].sel += (id / 2);
6576			alu.src[0].chan = ((id % 2) * 2) + 1;
6577		} else {
6578			/* r600 we have them at channel 2 of the second dword */
6579			alu.src[0].sel += (id * 2) + 1;
6580			alu.src[0].chan = 2;
6581		}
6582		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6583		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
6584		alu.last = 1;
6585		r = r600_bytecode_add_alu(ctx->bc, &alu);
6586		if (r)
6587			return r;
6588		/* disable writemask from texture instruction */
6589		inst->Dst[0].Register.WriteMask &= ~4;
6590	}
6591
6592	opcode = ctx->inst_info->op;
6593	if (opcode == FETCH_OP_GATHER4 &&
6594		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
6595		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
6596		opcode = FETCH_OP_GATHER4_O;
6597
6598		/* GATHER4_O/GATHER4_C_O use offset values loaded by
6599		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
6600		   encoded in the instruction are ignored. */
6601		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6602		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
6603		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6604		tex.sampler_index_mode = sampler_index_mode;
6605		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6606		tex.resource_index_mode = sampler_index_mode;
6607
6608		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
6609		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
6610		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
6611		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
6612		tex.src_sel_w = 4;
6613
6614		tex.dst_sel_x = 7;
6615		tex.dst_sel_y = 7;
6616		tex.dst_sel_z = 7;
6617		tex.dst_sel_w = 7;
6618
6619		r = r600_bytecode_add_tex(ctx->bc, &tex);
6620		if (r)
6621			return r;
6622	}
6623
6624	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6625	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6626	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6627	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6628	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
6629	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6630	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6631		switch (opcode) {
6632		case FETCH_OP_SAMPLE:
6633			opcode = FETCH_OP_SAMPLE_C;
6634			break;
6635		case FETCH_OP_SAMPLE_L:
6636			opcode = FETCH_OP_SAMPLE_C_L;
6637			break;
6638		case FETCH_OP_SAMPLE_LB:
6639			opcode = FETCH_OP_SAMPLE_C_LB;
6640			break;
6641		case FETCH_OP_SAMPLE_G:
6642			opcode = FETCH_OP_SAMPLE_C_G;
6643			break;
6644		/* Texture gather variants */
6645		case FETCH_OP_GATHER4:
6646			opcode = FETCH_OP_GATHER4_C;
6647			break;
6648		case FETCH_OP_GATHER4_O:
6649			opcode = FETCH_OP_GATHER4_C_O;
6650			break;
6651		}
6652	}
6653
6654	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6655	tex.op = opcode;
6656
6657	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6658	tex.sampler_index_mode = sampler_index_mode;
6659	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6660	tex.resource_index_mode = sampler_index_mode;
6661	tex.src_gpr = src_gpr;
6662	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6663
6664	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
6665		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
6666		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
6667	}
6668
6669	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
6670		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
6671		tex.inst_mod = texture_component_select;
6672
6673		if (ctx->bc->chip_class == CAYMAN) {
6674		/* GATHER4 result order is different from TGSI TG4 */
6675			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
6676			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
6677			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
6678			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6679		} else {
6680			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6681			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6682			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6683			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6684		}
6685	}
6686	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
6687		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6688		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6689		tex.dst_sel_z = 7;
6690		tex.dst_sel_w = 7;
6691	}
6692	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6693		tex.dst_sel_x = 3;
6694		tex.dst_sel_y = 7;
6695		tex.dst_sel_z = 7;
6696		tex.dst_sel_w = 7;
6697	}
6698	else {
6699		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6700		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6701		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6702		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6703	}
6704
6705
6706	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
6707	    inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6708		tex.src_sel_x = 4;
6709		tex.src_sel_y = 4;
6710		tex.src_sel_z = 4;
6711		tex.src_sel_w = 4;
6712	} else if (src_loaded) {
6713		tex.src_sel_x = 0;
6714		tex.src_sel_y = 1;
6715		tex.src_sel_z = 2;
6716		tex.src_sel_w = 3;
6717	} else {
6718		tex.src_sel_x = ctx->src[0].swizzle[0];
6719		tex.src_sel_y = ctx->src[0].swizzle[1];
6720		tex.src_sel_z = ctx->src[0].swizzle[2];
6721		tex.src_sel_w = ctx->src[0].swizzle[3];
6722		tex.src_rel = ctx->src[0].rel;
6723	}
6724
6725	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6726	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6727	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6728	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6729		tex.src_sel_x = 1;
6730		tex.src_sel_y = 0;
6731		tex.src_sel_z = 3;
6732		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
6733	}
6734
6735	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
6736	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
6737		tex.coord_type_x = 1;
6738		tex.coord_type_y = 1;
6739	}
6740	tex.coord_type_z = 1;
6741	tex.coord_type_w = 1;
6742
6743	tex.offset_x = offset_x;
6744	tex.offset_y = offset_y;
6745	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
6746		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6747		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
6748		tex.offset_z = 0;
6749	}
6750	else {
6751		tex.offset_z = offset_z;
6752	}
6753
6754	/* Put the depth for comparison in W.
6755	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
6756	 * Some instructions expect the depth in Z. */
6757	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6758	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6759	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6760	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
6761	    opcode != FETCH_OP_SAMPLE_C_L &&
6762	    opcode != FETCH_OP_SAMPLE_C_LB) {
6763		tex.src_sel_w = tex.src_sel_z;
6764	}
6765
6766	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
6767	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
6768		if (opcode == FETCH_OP_SAMPLE_C_L ||
6769		    opcode == FETCH_OP_SAMPLE_C_LB) {
6770			/* the array index is read from Y */
6771			tex.coord_type_y = 0;
6772		} else {
6773			/* the array index is read from Z */
6774			tex.coord_type_z = 0;
6775			tex.src_sel_z = tex.src_sel_y;
6776		}
6777	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6778		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6779		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6780		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6781		    (ctx->bc->chip_class >= EVERGREEN)))
6782		/* the array index is read from Z */
6783		tex.coord_type_z = 0;
6784
6785	/* mask unused source components */
6786	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
6787		switch (inst->Texture.Texture) {
6788		case TGSI_TEXTURE_2D:
6789		case TGSI_TEXTURE_RECT:
6790			tex.src_sel_z = 7;
6791			tex.src_sel_w = 7;
6792			break;
6793		case TGSI_TEXTURE_1D_ARRAY:
6794			tex.src_sel_y = 7;
6795			tex.src_sel_w = 7;
6796			break;
6797		case TGSI_TEXTURE_1D:
6798			tex.src_sel_y = 7;
6799			tex.src_sel_z = 7;
6800			tex.src_sel_w = 7;
6801			break;
6802		}
6803	}
6804
6805	r = r600_bytecode_add_tex(ctx->bc, &tex);
6806	if (r)
6807		return r;
6808
6809	/* add shadow ambient support  - gallium doesn't do it yet */
6810	return 0;
6811}
6812
6813static int tgsi_lrp(struct r600_shader_ctx *ctx)
6814{
6815	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6816	struct r600_bytecode_alu alu;
6817	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6818	unsigned i, temp_regs[2];
6819	int r;
6820
6821	/* optimize if it's just an equal balance */
6822	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
6823		for (i = 0; i < lasti + 1; i++) {
6824			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6825				continue;
6826
6827			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6828			alu.op = ALU_OP2_ADD;
6829			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6830			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6831			alu.omod = 3;
6832			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6833			alu.dst.chan = i;
6834			if (i == lasti) {
6835				alu.last = 1;
6836			}
6837			r = r600_bytecode_add_alu(ctx->bc, &alu);
6838			if (r)
6839				return r;
6840		}
6841		return 0;
6842	}
6843
6844	/* 1 - src0 */
6845	for (i = 0; i < lasti + 1; i++) {
6846		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6847			continue;
6848
6849		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6850		alu.op = ALU_OP2_ADD;
6851		alu.src[0].sel = V_SQ_ALU_SRC_1;
6852		alu.src[0].chan = 0;
6853		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6854		r600_bytecode_src_toggle_neg(&alu.src[1]);
6855		alu.dst.sel = ctx->temp_reg;
6856		alu.dst.chan = i;
6857		if (i == lasti) {
6858			alu.last = 1;
6859		}
6860		alu.dst.write = 1;
6861		r = r600_bytecode_add_alu(ctx->bc, &alu);
6862		if (r)
6863			return r;
6864	}
6865
6866	/* (1 - src0) * src2 */
6867	for (i = 0; i < lasti + 1; i++) {
6868		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6869			continue;
6870
6871		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6872		alu.op = ALU_OP2_MUL;
6873		alu.src[0].sel = ctx->temp_reg;
6874		alu.src[0].chan = i;
6875		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6876		alu.dst.sel = ctx->temp_reg;
6877		alu.dst.chan = i;
6878		if (i == lasti) {
6879			alu.last = 1;
6880		}
6881		alu.dst.write = 1;
6882		r = r600_bytecode_add_alu(ctx->bc, &alu);
6883		if (r)
6884			return r;
6885	}
6886
6887	/* src0 * src1 + (1 - src0) * src2 */
6888        if (ctx->src[0].abs)
6889		temp_regs[0] = r600_get_temp(ctx);
6890	else
6891		temp_regs[0] = 0;
6892	if (ctx->src[1].abs)
6893		temp_regs[1] = r600_get_temp(ctx);
6894	else
6895		temp_regs[1] = 0;
6896
6897	for (i = 0; i < lasti + 1; i++) {
6898		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6899			continue;
6900
6901		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6902		alu.op = ALU_OP3_MULADD;
6903		alu.is_op3 = 1;
6904		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
6905		if (r)
6906			return r;
6907		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
6908		if (r)
6909			return r;
6910		alu.src[2].sel = ctx->temp_reg;
6911		alu.src[2].chan = i;
6912
6913		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6914		alu.dst.chan = i;
6915		if (i == lasti) {
6916			alu.last = 1;
6917		}
6918		r = r600_bytecode_add_alu(ctx->bc, &alu);
6919		if (r)
6920			return r;
6921	}
6922	return 0;
6923}
6924
6925static int tgsi_cmp(struct r600_shader_ctx *ctx)
6926{
6927	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6928	struct r600_bytecode_alu alu;
6929	int i, r, j;
6930	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6931	int temp_regs[3];
6932
6933	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6934		temp_regs[j] = 0;
6935		if (ctx->src[j].abs)
6936			temp_regs[j] = r600_get_temp(ctx);
6937	}
6938
6939	for (i = 0; i < lasti + 1; i++) {
6940		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6941			continue;
6942
6943		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6944		alu.op = ALU_OP3_CNDGE;
6945		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
6946		if (r)
6947			return r;
6948		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
6949		if (r)
6950			return r;
6951		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
6952		if (r)
6953			return r;
6954		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6955		alu.dst.chan = i;
6956		alu.dst.write = 1;
6957		alu.is_op3 = 1;
6958		if (i == lasti)
6959			alu.last = 1;
6960		r = r600_bytecode_add_alu(ctx->bc, &alu);
6961		if (r)
6962			return r;
6963	}
6964	return 0;
6965}
6966
6967static int tgsi_ucmp(struct r600_shader_ctx *ctx)
6968{
6969	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6970	struct r600_bytecode_alu alu;
6971	int i, r;
6972	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6973
6974	for (i = 0; i < lasti + 1; i++) {
6975		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6976			continue;
6977
6978		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6979		alu.op = ALU_OP3_CNDE_INT;
6980		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6981		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6982		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6983		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6984		alu.dst.chan = i;
6985		alu.dst.write = 1;
6986		alu.is_op3 = 1;
6987		if (i == lasti)
6988			alu.last = 1;
6989		r = r600_bytecode_add_alu(ctx->bc, &alu);
6990		if (r)
6991			return r;
6992	}
6993	return 0;
6994}
6995
/* Emit TGSI XPD (cross product) in two ALU passes:
 *   pass 1: temp = src0.zxy * src1.yzx
 *   pass 2: dst  = src0.yzx * src1.zxy - temp
 * so e.g. dst.x = src0.y*src1.z - src0.z*src1.y.
 *
 * NOTE(review): the w channel is built from 0*0 operands in both passes,
 * so dst.w comes out as 0, whereas the TGSI spec defines XPD.w = 1.0 —
 * confirm whether any caller relies on w.
 *
 * If the write mask is partial, the result is staged in temp_reg and
 * copied to the real destination by tgsi_helper_copy(). */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* channel rotations forming the two cross-product terms */
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	/* can't write straight to dst when some channels are masked off */
	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* pass 1: temp[i] = src0[zxy[i]] * src1[yzx[i]]; w gets 0*0 */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			/* w channel: multiply two inline zeros */
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: dst[i] = src0[yzx[i]] * src1[zxy[i]] - temp[i]
	 * (the swizzle tables are applied to the opposite sources, and
	 * temp is subtracted via the negate modifier on src[2]) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			/* w channel: 0*0 again */
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1;
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	if (use_temp)
		return tgsi_helper_copy(ctx, inst);
	return 0;
}
7067
7068static int tgsi_exp(struct r600_shader_ctx *ctx)
7069{
7070	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7071	struct r600_bytecode_alu alu;
7072	int r;
7073	int i;
7074
7075	/* result.x = 2^floor(src); */
7076	if (inst->Dst[0].Register.WriteMask & 1) {
7077		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7078
7079		alu.op = ALU_OP1_FLOOR;
7080		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7081
7082		alu.dst.sel = ctx->temp_reg;
7083		alu.dst.chan = 0;
7084		alu.dst.write = 1;
7085		alu.last = 1;
7086		r = r600_bytecode_add_alu(ctx->bc, &alu);
7087		if (r)
7088			return r;
7089
7090		if (ctx->bc->chip_class == CAYMAN) {
7091			for (i = 0; i < 3; i++) {
7092				alu.op = ALU_OP1_EXP_IEEE;
7093				alu.src[0].sel = ctx->temp_reg;
7094				alu.src[0].chan = 0;
7095
7096				alu.dst.sel = ctx->temp_reg;
7097				alu.dst.chan = i;
7098				alu.dst.write = i == 0;
7099				alu.last = i == 2;
7100				r = r600_bytecode_add_alu(ctx->bc, &alu);
7101				if (r)
7102					return r;
7103			}
7104		} else {
7105			alu.op = ALU_OP1_EXP_IEEE;
7106			alu.src[0].sel = ctx->temp_reg;
7107			alu.src[0].chan = 0;
7108
7109			alu.dst.sel = ctx->temp_reg;
7110			alu.dst.chan = 0;
7111			alu.dst.write = 1;
7112			alu.last = 1;
7113			r = r600_bytecode_add_alu(ctx->bc, &alu);
7114			if (r)
7115				return r;
7116		}
7117	}
7118
7119	/* result.y = tmp - floor(tmp); */
7120	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7121		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7122
7123		alu.op = ALU_OP1_FRACT;
7124		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7125
7126		alu.dst.sel = ctx->temp_reg;
7127#if 0
7128		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7129		if (r)
7130			return r;
7131#endif
7132		alu.dst.write = 1;
7133		alu.dst.chan = 1;
7134
7135		alu.last = 1;
7136
7137		r = r600_bytecode_add_alu(ctx->bc, &alu);
7138		if (r)
7139			return r;
7140	}
7141
7142	/* result.z = RoughApprox2ToX(tmp);*/
7143	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
7144		if (ctx->bc->chip_class == CAYMAN) {
7145			for (i = 0; i < 3; i++) {
7146				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7147				alu.op = ALU_OP1_EXP_IEEE;
7148				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7149
7150				alu.dst.sel = ctx->temp_reg;
7151				alu.dst.chan = i;
7152				if (i == 2) {
7153					alu.dst.write = 1;
7154					alu.last = 1;
7155				}
7156
7157				r = r600_bytecode_add_alu(ctx->bc, &alu);
7158				if (r)
7159					return r;
7160			}
7161		} else {
7162			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7163			alu.op = ALU_OP1_EXP_IEEE;
7164			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7165
7166			alu.dst.sel = ctx->temp_reg;
7167			alu.dst.write = 1;
7168			alu.dst.chan = 2;
7169
7170			alu.last = 1;
7171
7172			r = r600_bytecode_add_alu(ctx->bc, &alu);
7173			if (r)
7174				return r;
7175		}
7176	}
7177
7178	/* result.w = 1.0;*/
7179	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
7180		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7181
7182		alu.op = ALU_OP1_MOV;
7183		alu.src[0].sel = V_SQ_ALU_SRC_1;
7184		alu.src[0].chan = 0;
7185
7186		alu.dst.sel = ctx->temp_reg;
7187		alu.dst.chan = 3;
7188		alu.dst.write = 1;
7189		alu.last = 1;
7190		r = r600_bytecode_add_alu(ctx->bc, &alu);
7191		if (r)
7192			return r;
7193	}
7194	return tgsi_helper_copy(ctx, inst);
7195}
7196
7197static int tgsi_log(struct r600_shader_ctx *ctx)
7198{
7199	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7200	struct r600_bytecode_alu alu;
7201	int r;
7202	int i;
7203
7204	/* result.x = floor(log2(|src|)); */
7205	if (inst->Dst[0].Register.WriteMask & 1) {
7206		if (ctx->bc->chip_class == CAYMAN) {
7207			for (i = 0; i < 3; i++) {
7208				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7209
7210				alu.op = ALU_OP1_LOG_IEEE;
7211				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7212				r600_bytecode_src_set_abs(&alu.src[0]);
7213
7214				alu.dst.sel = ctx->temp_reg;
7215				alu.dst.chan = i;
7216				if (i == 0)
7217					alu.dst.write = 1;
7218				if (i == 2)
7219					alu.last = 1;
7220				r = r600_bytecode_add_alu(ctx->bc, &alu);
7221				if (r)
7222					return r;
7223			}
7224
7225		} else {
7226			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7227
7228			alu.op = ALU_OP1_LOG_IEEE;
7229			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7230			r600_bytecode_src_set_abs(&alu.src[0]);
7231
7232			alu.dst.sel = ctx->temp_reg;
7233			alu.dst.chan = 0;
7234			alu.dst.write = 1;
7235			alu.last = 1;
7236			r = r600_bytecode_add_alu(ctx->bc, &alu);
7237			if (r)
7238				return r;
7239		}
7240
7241		alu.op = ALU_OP1_FLOOR;
7242		alu.src[0].sel = ctx->temp_reg;
7243		alu.src[0].chan = 0;
7244
7245		alu.dst.sel = ctx->temp_reg;
7246		alu.dst.chan = 0;
7247		alu.dst.write = 1;
7248		alu.last = 1;
7249
7250		r = r600_bytecode_add_alu(ctx->bc, &alu);
7251		if (r)
7252			return r;
7253	}
7254
7255	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
7256	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7257
7258		if (ctx->bc->chip_class == CAYMAN) {
7259			for (i = 0; i < 3; i++) {
7260				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7261
7262				alu.op = ALU_OP1_LOG_IEEE;
7263				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7264				r600_bytecode_src_set_abs(&alu.src[0]);
7265
7266				alu.dst.sel = ctx->temp_reg;
7267				alu.dst.chan = i;
7268				if (i == 1)
7269					alu.dst.write = 1;
7270				if (i == 2)
7271					alu.last = 1;
7272
7273				r = r600_bytecode_add_alu(ctx->bc, &alu);
7274				if (r)
7275					return r;
7276			}
7277		} else {
7278			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7279
7280			alu.op = ALU_OP1_LOG_IEEE;
7281			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7282			r600_bytecode_src_set_abs(&alu.src[0]);
7283
7284			alu.dst.sel = ctx->temp_reg;
7285			alu.dst.chan = 1;
7286			alu.dst.write = 1;
7287			alu.last = 1;
7288
7289			r = r600_bytecode_add_alu(ctx->bc, &alu);
7290			if (r)
7291				return r;
7292		}
7293
7294		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7295
7296		alu.op = ALU_OP1_FLOOR;
7297		alu.src[0].sel = ctx->temp_reg;
7298		alu.src[0].chan = 1;
7299
7300		alu.dst.sel = ctx->temp_reg;
7301		alu.dst.chan = 1;
7302		alu.dst.write = 1;
7303		alu.last = 1;
7304
7305		r = r600_bytecode_add_alu(ctx->bc, &alu);
7306		if (r)
7307			return r;
7308
7309		if (ctx->bc->chip_class == CAYMAN) {
7310			for (i = 0; i < 3; i++) {
7311				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7312				alu.op = ALU_OP1_EXP_IEEE;
7313				alu.src[0].sel = ctx->temp_reg;
7314				alu.src[0].chan = 1;
7315
7316				alu.dst.sel = ctx->temp_reg;
7317				alu.dst.chan = i;
7318				if (i == 1)
7319					alu.dst.write = 1;
7320				if (i == 2)
7321					alu.last = 1;
7322
7323				r = r600_bytecode_add_alu(ctx->bc, &alu);
7324				if (r)
7325					return r;
7326			}
7327		} else {
7328			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7329			alu.op = ALU_OP1_EXP_IEEE;
7330			alu.src[0].sel = ctx->temp_reg;
7331			alu.src[0].chan = 1;
7332
7333			alu.dst.sel = ctx->temp_reg;
7334			alu.dst.chan = 1;
7335			alu.dst.write = 1;
7336			alu.last = 1;
7337
7338			r = r600_bytecode_add_alu(ctx->bc, &alu);
7339			if (r)
7340				return r;
7341		}
7342
7343		if (ctx->bc->chip_class == CAYMAN) {
7344			for (i = 0; i < 3; i++) {
7345				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7346				alu.op = ALU_OP1_RECIP_IEEE;
7347				alu.src[0].sel = ctx->temp_reg;
7348				alu.src[0].chan = 1;
7349
7350				alu.dst.sel = ctx->temp_reg;
7351				alu.dst.chan = i;
7352				if (i == 1)
7353					alu.dst.write = 1;
7354				if (i == 2)
7355					alu.last = 1;
7356
7357				r = r600_bytecode_add_alu(ctx->bc, &alu);
7358				if (r)
7359					return r;
7360			}
7361		} else {
7362			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7363			alu.op = ALU_OP1_RECIP_IEEE;
7364			alu.src[0].sel = ctx->temp_reg;
7365			alu.src[0].chan = 1;
7366
7367			alu.dst.sel = ctx->temp_reg;
7368			alu.dst.chan = 1;
7369			alu.dst.write = 1;
7370			alu.last = 1;
7371
7372			r = r600_bytecode_add_alu(ctx->bc, &alu);
7373			if (r)
7374				return r;
7375		}
7376
7377		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7378
7379		alu.op = ALU_OP2_MUL;
7380
7381		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7382		r600_bytecode_src_set_abs(&alu.src[0]);
7383
7384		alu.src[1].sel = ctx->temp_reg;
7385		alu.src[1].chan = 1;
7386
7387		alu.dst.sel = ctx->temp_reg;
7388		alu.dst.chan = 1;
7389		alu.dst.write = 1;
7390		alu.last = 1;
7391
7392		r = r600_bytecode_add_alu(ctx->bc, &alu);
7393		if (r)
7394			return r;
7395	}
7396
7397	/* result.z = log2(|src|);*/
7398	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
7399		if (ctx->bc->chip_class == CAYMAN) {
7400			for (i = 0; i < 3; i++) {
7401				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7402
7403				alu.op = ALU_OP1_LOG_IEEE;
7404				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7405				r600_bytecode_src_set_abs(&alu.src[0]);
7406
7407				alu.dst.sel = ctx->temp_reg;
7408				if (i == 2)
7409					alu.dst.write = 1;
7410				alu.dst.chan = i;
7411				if (i == 2)
7412					alu.last = 1;
7413
7414				r = r600_bytecode_add_alu(ctx->bc, &alu);
7415				if (r)
7416					return r;
7417			}
7418		} else {
7419			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7420
7421			alu.op = ALU_OP1_LOG_IEEE;
7422			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7423			r600_bytecode_src_set_abs(&alu.src[0]);
7424
7425			alu.dst.sel = ctx->temp_reg;
7426			alu.dst.write = 1;
7427			alu.dst.chan = 2;
7428			alu.last = 1;
7429
7430			r = r600_bytecode_add_alu(ctx->bc, &alu);
7431			if (r)
7432				return r;
7433		}
7434	}
7435
7436	/* result.w = 1.0; */
7437	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
7438		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7439
7440		alu.op = ALU_OP1_MOV;
7441		alu.src[0].sel = V_SQ_ALU_SRC_1;
7442		alu.src[0].chan = 0;
7443
7444		alu.dst.sel = ctx->temp_reg;
7445		alu.dst.chan = 3;
7446		alu.dst.write = 1;
7447		alu.last = 1;
7448
7449		r = r600_bytecode_add_alu(ctx->bc, &alu);
7450		if (r)
7451			return r;
7452	}
7453
7454	return tgsi_helper_copy(ctx, inst);
7455}
7456
7457static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
7458{
7459	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7460	struct r600_bytecode_alu alu;
7461	int r;
7462	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7463	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
7464
7465	assert(inst->Dst[0].Register.Index < 3);
7466	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7467
7468	switch (inst->Instruction.Opcode) {
7469	case TGSI_OPCODE_ARL:
7470		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
7471		break;
7472	case TGSI_OPCODE_ARR:
7473		alu.op = ALU_OP1_FLT_TO_INT;
7474		break;
7475	case TGSI_OPCODE_UARL:
7476		alu.op = ALU_OP1_MOV;
7477		break;
7478	default:
7479		assert(0);
7480		return -1;
7481	}
7482
7483	for (i = 0; i <= lasti; ++i) {
7484		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7485			continue;
7486		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7487		alu.last = i == lasti;
7488		alu.dst.sel = reg;
7489	        alu.dst.chan = i;
7490		alu.dst.write = 1;
7491		r = r600_bytecode_add_alu(ctx->bc, &alu);
7492		if (r)
7493			return r;
7494	}
7495
7496	if (inst->Dst[0].Register.Index > 0)
7497		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
7498	else
7499		ctx->bc->ar_loaded = 0;
7500
7501	return 0;
7502}
/* Pre-Evergreen (r600/r700) ARL/ARR/UARL: write the converted source into
 * the AR backing register (bc->ar_reg) and mark AR as not loaded so it is
 * re-fetched before the next relative access.  Unlike the Evergreen path,
 * FLT_TO_INT_FLOOR does not exist here, so ARL is emitted as FLOOR
 * followed by FLT_TO_INT.  Returns 0 on success or a negative error. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* pass 1: ar_reg = floor(src) for each written channel */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* pass 2: ar_reg = (int)ar_reg, in place.
		 * NOTE(review): this loop ignores the write mask and converts
		 * every channel up to lasti, including ones pass 1 did not
		 * write — harmless for AR use, but confirm intent. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* round-to-nearest conversion, one trans slot per channel */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer; plain move */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force a reload of AR before its next use */
	ctx->bc->ar_loaded = 0;
	return 0;
}
7579
7580static int tgsi_opdst(struct r600_shader_ctx *ctx)
7581{
7582	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7583	struct r600_bytecode_alu alu;
7584	int i, r = 0;
7585
7586	for (i = 0; i < 4; i++) {
7587		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7588
7589		alu.op = ALU_OP2_MUL;
7590		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7591
7592		if (i == 0 || i == 3) {
7593			alu.src[0].sel = V_SQ_ALU_SRC_1;
7594		} else {
7595			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7596		}
7597
7598		if (i == 0 || i == 2) {
7599			alu.src[1].sel = V_SQ_ALU_SRC_1;
7600		} else {
7601			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
7602		}
7603		if (i == 3)
7604			alu.last = 1;
7605		r = r600_bytecode_add_alu(ctx->bc, &alu);
7606		if (r)
7607			return r;
7608	}
7609	return 0;
7610}
7611
7612static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
7613{
7614	struct r600_bytecode_alu alu;
7615	int r;
7616
7617	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7618	alu.op = opcode;
7619	alu.execute_mask = 1;
7620	alu.update_pred = 1;
7621
7622	alu.dst.sel = ctx->temp_reg;
7623	alu.dst.write = 1;
7624	alu.dst.chan = 0;
7625
7626	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7627	alu.src[1].sel = V_SQ_ALU_SRC_0;
7628	alu.src[1].chan = 0;
7629
7630	alu.last = 1;
7631
7632	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
7633	if (r)
7634		return r;
7635	return 0;
7636}
7637
/* Emit `pops` stack pops.  Where possible the pops are folded into the
 * preceding ALU clause by rewriting its CF op to ALU_POP_AFTER (1 pop) or
 * ALU_POP2_AFTER (2 pops); otherwise an explicit POP CF instruction is
 * emitted.  Note the parameter shadows the function name. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop = pops already carried by the last CF instruction;
		 * 3 is a sentinel meaning "last CF cannot absorb pops". */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* the rewritten clause must not be extended further */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more than 2 total pops: fall back to explicit POP */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* POP continues at the CF instruction right after itself */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
7670
/* Recompute the worst-case hardware branch-stack depth (in stack entries)
 * needed by the shader so far, from the current push/push_wqm/loop
 * counters, and record it in stack->max_entries.  Called after every
 * callstack_push(); `reason` is the FC_* kind of the operation just
 * pushed, used for the per-chip extra-element rules below. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM frames occupy a full entry; VPM pushes one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
7734
7735static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
7736{
7737	switch(reason) {
7738	case FC_PUSH_VPM:
7739		--ctx->bc->stack.push;
7740		assert(ctx->bc->stack.push >= 0);
7741		break;
7742	case FC_PUSH_WQM:
7743		--ctx->bc->stack.push_wqm;
7744		assert(ctx->bc->stack.push_wqm >= 0);
7745		break;
7746	case FC_LOOP:
7747		--ctx->bc->stack.loop;
7748		assert(ctx->bc->stack.loop >= 0);
7749		break;
7750	default:
7751		assert(0);
7752		break;
7753	}
7754}
7755
7756static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
7757{
7758	switch (reason) {
7759	case FC_PUSH_VPM:
7760		++ctx->bc->stack.push;
7761		break;
7762	case FC_PUSH_WQM:
7763		++ctx->bc->stack.push_wqm;
7764	case FC_LOOP:
7765		++ctx->bc->stack.loop;
7766		break;
7767	default:
7768		assert(0);
7769	}
7770
7771	callstack_update_max_depth(ctx, reason);
7772}
7773
/* Append the CF instruction just emitted (bc->cf_last) to the "mid" list
 * (ELSE / BREAK / CONTINUE) of the control-flow stack frame at fc_sp, so
 * its jump address can be fixed up when the frame is closed by
 * ENDIF/ENDLOOP.
 * NOTE(review): the realloc() result is unchecked — on allocation failure
 * sp->mid is NULL and the store below dereferences it.  There is no error
 * path from this void function to report OOM. */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	sp->mid = realloc((void *)sp->mid,
						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}
7783
7784static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
7785{
7786	ctx->bc->fc_sp++;
7787	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
7788	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
7789}
7790
7791static void fc_poplevel(struct r600_shader_ctx *ctx)
7792{
7793	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
7794	free(sp->mid);
7795	sp->mid = NULL;
7796	sp->num_mid = 0;
7797	sp->start = NULL;
7798	sp->type = 0;
7799	ctx->bc->fc_sp--;
7800}
7801
7802#if 0
7803static int emit_return(struct r600_shader_ctx *ctx)
7804{
7805	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
7806	return 0;
7807}
7808
7809static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
7810{
7811
7812	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
7813	ctx->bc->cf_last->pop_count = pops;
7814	/* XXX work out offset */
7815	return 0;
7816}
7817
7818static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
7819{
7820	return 0;
7821}
7822
7823static void emit_testflag(struct r600_shader_ctx *ctx)
7824{
7825
7826}
7827
7828static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
7829{
7830	emit_testflag(ctx);
7831	emit_jump_to_offset(ctx, 1, 4);
7832	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
7833	pops(ctx, ifidx + 1);
7834	emit_return(ctx);
7835}
7836
7837static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
7838{
7839	emit_testflag(ctx);
7840
7841	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7842	ctx->bc->cf_last->pop_count = 1;
7843
7844	fc_set_mid(ctx, fc_sp);
7845
7846	pops(ctx, 1);
7847}
7848#endif
7849
7850static int emit_if(struct r600_shader_ctx *ctx, int opcode)
7851{
7852	int alu_type = CF_OP_ALU_PUSH_BEFORE;
7853
7854	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
7855	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
7856	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
7857	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
7858	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
7859		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
7860		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
7861		alu_type = CF_OP_ALU;
7862	}
7863
7864	emit_logic_pred(ctx, opcode, alu_type);
7865
7866	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
7867
7868	fc_pushlevel(ctx, FC_IF);
7869
7870	callstack_push(ctx, FC_PUSH_VPM);
7871	return 0;
7872}
7873
/* IF: take the branch when the float condition src.x != 0.0. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
7878
/* UIF: take the branch when the integer condition src.x != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
7883
7884static int tgsi_else(struct r600_shader_ctx *ctx)
7885{
7886	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
7887	ctx->bc->cf_last->pop_count = 1;
7888
7889	fc_set_mid(ctx, ctx->bc->fc_sp);
7890	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
7891	return 0;
7892}
7893
7894static int tgsi_endif(struct r600_shader_ctx *ctx)
7895{
7896	pops(ctx, 1);
7897	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
7898		R600_ERR("if/endif unbalanced in shader\n");
7899		return -1;
7900	}
7901
7902	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
7903		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7904		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
7905	} else {
7906		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
7907	}
7908	fc_poplevel(ctx);
7909
7910	callstack_pop(ctx, FC_PUSH_VPM);
7911	return 0;
7912}
7913
7914static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
7915{
7916	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
7917	 * limited to 4096 iterations, like the other LOOP_* instructions. */
7918	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
7919
7920	fc_pushlevel(ctx, FC_LOOP);
7921
7922	/* check stack depth */
7923	callstack_push(ctx, FC_LOOP);
7924	return 0;
7925}
7926
7927static int tgsi_endloop(struct r600_shader_ctx *ctx)
7928{
7929	int i;
7930
7931	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
7932
7933	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
7934		R600_ERR("loop/endloop in shader code are not paired.\n");
7935		return -EINVAL;
7936	}
7937
7938	/* fixup loop pointers - from r600isa
7939	   LOOP END points to CF after LOOP START,
7940	   LOOP START point to CF after LOOP END
7941	   BRK/CONT point to LOOP END CF
7942	*/
7943	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
7944
7945	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7946
7947	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
7948		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
7949	}
7950	/* XXX add LOOPRET support */
7951	fc_poplevel(ctx);
7952	callstack_pop(ctx, FC_LOOP);
7953	return 0;
7954}
7955
/* BREAKC: conditional loop break.  On most Evergreen parts ALU_BREAK does
 * not save the active mask correctly (HW bug), so the break is emitted as
 * an UIF / LOOP_BREAK / ENDIF sequence instead of a predicated ALU_BREAK
 * clause.  Returns -EINVAL if not inside a loop. */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* locate the innermost enclosing FC_LOOP frame */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		/* register the break for address fixup at ENDLOOP */
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		/* predicated break folded into the ALU clause */
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
7994
7995static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
7996{
7997	unsigned int fscp;
7998
7999	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
8000	{
8001		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
8002			break;
8003	}
8004
8005	if (fscp == 0) {
8006		R600_ERR("Break not inside loop/endloop pair\n");
8007		return -EINVAL;
8008	}
8009
8010	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8011
8012	fc_set_mid(ctx, fscp);
8013
8014	return 0;
8015}
8016
8017static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
8018{
8019	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8020	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
8021	int r;
8022
8023	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
8024		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
8025
8026	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8027	if (!r) {
8028		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
8029		return emit_inc_ring_offset(ctx, stream, TRUE);
8030	}
8031	return r;
8032}
8033
/* UMAD: dst = src0 * src1 + src2 (unsigned integer), per write-mask
 * channel.  Pass 1 puts the low 32 bits of the product (MULLO_UINT) in
 * ctx->temp_reg; pass 2 adds src2 with ADD_INT into the destination.
 * Returns 0 on success or a negative error code. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* MULLO_UINT is replicated across all four slots on
			 * Cayman; only the slot matching the destination
			 * channel actually writes (see CAYMAN notes above). */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				/* NOTE(review): k iterates over NumSrcRegs (3 for
				 * UMAD), so src[2] is also populated even though
				 * MULLO only reads two operands — confirm the
				 * extra source is ignored by the encoder. */
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			/* MULLO_UINT is trans-only here, hence alu.last below */
			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* (src0 * src1) + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
8105
8106static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
8107	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
8108	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
8109	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
8110
8111	/* XXX:
8112	 * For state trackers other than OpenGL, we'll want to use
8113	 * _RECIP_IEEE instead.
8114	 */
8115	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
8116
8117	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
8118	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
8119	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
8120	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
8121	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
8122	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
8123	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
8124	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
8125	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
8126	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
8127	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
8128	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
8129	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
8130	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
8131	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
8132	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
8133	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
8134	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
8135	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
8136	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
8137	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
8138	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
8139	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
8140	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
8141	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
8142	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
8143	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
8144	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
8145	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
8146	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
8147	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
8148	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
8149	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
8150	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8151	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8152	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
8153	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
8154	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
8155	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
8156	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
8157	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
8158	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
8159	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
8160	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
8161	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
8162	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
8163	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
8164	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
8165	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
8166	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
8167	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
8168	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
8169	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
8170	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
8171	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
8172	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
8173	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
8174	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
8175	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
8176	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
8177	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
8178	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
8179	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
8180	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
8181	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8182	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
8183	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
8184	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
8185	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8186	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
8187	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
8188	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
8189	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
8190	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
8191	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
8192	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
8193	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
8194	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
8195	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
8196	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
8197	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
8198	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
8199	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
8200	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
8201	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
8202	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
8203	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
8204	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
8205	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
8206	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8207	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
8208	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8209	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
8210	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
8211	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
8212	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
8213	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8214	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
8215	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
8216	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8217	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
8218	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
8219	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
8220	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
8221	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
8222	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
8223	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
8224	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
8225	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
8226	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
8227	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
8228	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
8229	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
8230	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
8231	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
8232	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
8233	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
8234	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
8235	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
8236	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
8237	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
8238	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
8239	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
8240	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
8241	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
8242	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
8243	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
8244	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
8245	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
8246	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
8247	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
8248	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
8249	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
8250	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
8251	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
8252	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
8253	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
8254	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
8255	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
8256	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
8257	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
8258	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
8259	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
8260	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
8261	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
8262	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
8263	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
8264	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
8265	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
8266	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
8267	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
8268	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
8269	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
8270	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
8271	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
8272	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
8273	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
8274	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
8275	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
8276	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8277	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8278	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
8279	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
8280	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
8281	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
8282	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
8283	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
8284	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8285	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
8286	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8287	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8288	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
8289	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
8290	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
8291	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
8292	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
8293	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
8294	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
8295	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
8296	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
8297	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
8298	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
8299	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
8300	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
8301	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
8302	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
8303	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
8304	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
8305	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
8306	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
8307	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
8308	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
8309};
8310
/*
 * Evergreen (EG) TGSI opcode dispatch table, indexed by TGSI_OPCODE_*.
 *
 * Each entry pairs a hardware opcode (ALU_OP*, FETCH_OP_* or CF_OP_*)
 * with the emit callback that translates the TGSI instruction into one
 * or more r600 ISA instructions.  Entries mapped to tgsi_unsupported
 * (and the bare numeric indices, which are opcode slots with no name in
 * this tree) are opcodes this backend does not handle for Evergreen.
 * Designated initializers leave any unlisted slot zero-filled.
 */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	/* SLT/SLE have no direct hw op: emitted as swapped SETGT/SETGE */
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	/* SUB is emitted as ADD; the emit path negates the second source */
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* control flow maps to CF instructions rather than ALU ops */
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	/* NOTE(review): the r600 table wires BREAKC to tgsi_loop_breakc;
	 * here it is unsupported — confirm this is intentional for EG */
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* SM5-style SAMPLE_* opcodes are not implemented by this backend */
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* 64-bit (double-precision) opcodes */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8532
/*
 * Cayman (CM) TGSI opcode dispatch table, indexed by TGSI_OPCODE_*.
 *
 * Same layout as the Evergreen table: each entry pairs a hardware opcode
 * with the emit callback used to translate the TGSI instruction.  Cayman
 * has no dedicated transcendental (t) slot, so ops that Evergreen routes
 * through tgsi_trans_srcx_replicate/tgsi_trig/tgsi_op2_trans use the
 * cayman_* replication helpers (or plain tgsi_op2 for the conversions)
 * here instead — see the "CAYMAN notes" comment at the top of this file.
 */
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	/* SLT/SLE have no direct hw op: emitted as swapped SETGT/SETGE */
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	/* SUB is emitted as ADD; the emit path negates the second source */
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* control flow maps to CF instructions rather than ALU ops */
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	/* conversions run in the vector slots on Cayman: plain tgsi_op2 */
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	/* integer multiplies loop over all slots on Cayman */
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	/* SM5-style SAMPLE_* opcodes are not implemented by this backend */
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* 64-bit (double-precision) opcodes */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8754