r600_shader.c revision 9662a43d23c0ae46b4294561476b57e22e76ae04
1/*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include "r600_sq.h"
24#include "r600_llvm.h"
25#include "r600_formats.h"
26#include "r600_opcodes.h"
27#include "r600_shader.h"
28#include "r600d.h"
29
30#include "sb/sb_public.h"
31
32#include "pipe/p_shader_tokens.h"
33#include "tgsi/tgsi_info.h"
34#include "tgsi/tgsi_parse.h"
35#include "tgsi/tgsi_scan.h"
36#include "tgsi/tgsi_dump.h"
37#include "util/u_memory.h"
38#include "util/u_math.h"
39#include <stdio.h>
40#include <errno.h>
41
42/* CAYMAN notes
43Why CAYMAN got loops for lots of instructions is explained here.
44
45-These 8xx t-slot only ops are implemented in all vector slots.
46MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47These 8xx t-slot only opcodes become vector ops, with all four
48slots expecting the arguments on sources a and b. Result is
49broadcast to all channels.
50MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51These 8xx t-slot only opcodes become vector ops in the z, y, and
52x slots.
53EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55SQRT_IEEE/_64
56SIN/COS
57The w slot may have an independent co-issued operation, or if the
58result is required to be in the w slot, the opcode above may be
59issued in the w slot as well.
60The compiler must issue the source argument to slots z, y, and x
61*/
62
63/* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66      .y = RelVertexID (??)
67      .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70      r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73       .y = RelPatchID (??)
74       .z = InvocationID
75       .w = tess factor base.
76
77 TES - .x = TessCoord.x
78     - .y = TessCoord.y
79     - .z = RelPatchID (??)
80     - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83      face_gpr.w = SampleID
84*/
85#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86static int r600_shader_from_tgsi(struct r600_context *rctx,
87				 struct r600_pipe_shader *pipeshader,
88				 union r600_shader_key key);
89
90static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91                           int size, unsigned comp_mask) {
92
93	if (!size)
94		return;
95
96	if (ps->num_arrays == ps->max_arrays) {
97		ps->max_arrays += 64;
98		ps->arrays = realloc(ps->arrays, ps->max_arrays *
99		                     sizeof(struct r600_shader_array));
100	}
101
102	int n = ps->num_arrays;
103	++ps->num_arrays;
104
105	ps->arrays[n].comp_mask = comp_mask;
106	ps->arrays[n].gpr_start = start_gpr;
107	ps->arrays[n].gpr_count = size;
108}
109
110static void r600_dump_streamout(struct pipe_stream_output_info *so)
111{
112	unsigned i;
113
114	fprintf(stderr, "STREAMOUT\n");
115	for (i = 0; i < so->num_outputs; i++) {
116		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117				so->output[i].start_component;
118		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119			i,
120			so->output[i].stream,
121			so->output[i].output_buffer,
122			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123			so->output[i].register_index,
124			mask & 1 ? "x" : "",
125		        mask & 2 ? "y" : "",
126		        mask & 4 ? "z" : "",
127		        mask & 8 ? "w" : "",
128			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129	}
130}
131
132static int store_shader(struct pipe_context *ctx,
133			struct r600_pipe_shader *shader)
134{
135	struct r600_context *rctx = (struct r600_context *)ctx;
136	uint32_t *ptr, i;
137
138	if (shader->bo == NULL) {
139		shader->bo = (struct r600_resource*)
140			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141		if (shader->bo == NULL) {
142			return -ENOMEM;
143		}
144		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
145		if (R600_BIG_ENDIAN) {
146			for (i = 0; i < shader->shader.bc.ndw; ++i) {
147				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
148			}
149		} else {
150			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
151		}
152		rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
153	}
154
155	return 0;
156}
157
/* Compile a TGSI shader: translate it to r600 bytecode, optionally run
 * the SB optimizing backend, upload the bytecode and build the
 * processor-specific hardware state.
 *
 * Returns 0 on success or a negative errno value; on failure the
 * partially initialized shader is destroyed before returning.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* SB backend can be disabled entirely, or run only for disassembly */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == TGSI_PROCESSOR_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	/* SB is not used for the tessellation stages */
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_EVAL);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built.  When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* run SB either to optimize (use_sb) or only to disassemble */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_GEOMETRY:
		/* the GS copy shader feeds the fixed-function VS stage */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
291
/* Release everything a compiled shader owns: the uploaded bytecode
 * buffer object, the bytecode itself and the shader's command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	/* drops the reference on the uploaded buffer (no-op if never stored) */
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
298
/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

/* A TGSI source operand decoded into r600 terms. */
struct r600_shader_src {
	unsigned				sel;
	unsigned				swizzle[4];
	unsigned				neg;
	unsigned				abs;
	unsigned				rel;
	unsigned				kc_bank;
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4]; /* literal values (presumably used when sel selects a literal) — verify */
};

/* Bookkeeping for one evergreen interpolator; see eg_get_interpolator_index
 * for the indexing scheme used with eg_interpolators[] below. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};

/* All the state carried around while translating one TGSI shader into
 * r600 bytecode. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type; /* TGSI_PROCESSOR_* of the shader being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* first GPR used by each TGSI register file */
	unsigned				temp_reg; /* base GPR for driver temps (see r600_get_temp) */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	boolean use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;
	unsigned		edgeflag_output;
	int					fragcoord_input;
	int					native_integers;
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess output offsets */
};

/* Maps one TGSI opcode onto the callback that emits its bytecode. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};

/* forward declarations of helpers defined later in this file */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg);
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			const struct r600_shader_src *shader_src,
			unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg);
381
/* Return the index of the highest channel (0-3) set in a TGSI writemask;
 * an empty mask yields 0. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
393
/* Reject TGSI constructs this backend cannot handle.
 * Returns 0 if the current instruction is supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	/* DFRACEXP is the only opcode allowed two destinations */
	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	/* 2D (dimensioned) sources are only supported for constants and for
	 * the inputs/outputs of the stages listed below. */
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
		   switch (i->Src[j].Register.File) {
		   case TGSI_FILE_CONSTANT:
			   break;
		   case TGSI_FILE_INPUT:
			   if (ctx->type == TGSI_PROCESSOR_GEOMETRY ||
			       ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
			       ctx->type == TGSI_PROCESSOR_TESS_EVAL)
				   break;
			   /* fallthrough */
		   case TGSI_FILE_OUTPUT:
			   if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
				   break;
			   /* fallthrough */
		   default:
			   R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
				    i->Src[j].Register.File,
				    i->Src[j].Register.Dimension);
			   return -EINVAL;
		   }
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			/* only TCS destinations may be 2D-addressed */
			if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
444
445int eg_get_interpolator_index(unsigned interpolate, unsigned location)
446{
447	if (interpolate == TGSI_INTERPOLATE_COLOR ||
448		interpolate == TGSI_INTERPOLATE_LINEAR ||
449		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
450	{
451		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
452		int loc;
453
454		switch(location) {
455		case TGSI_INTERPOLATE_LOC_CENTER:
456			loc = 1;
457			break;
458		case TGSI_INTERPOLATE_LOC_CENTROID:
459			loc = 2;
460			break;
461		case TGSI_INTERPOLATE_LOC_SAMPLE:
462		default:
463			loc = 0; break;
464		}
465
466		return is_linear * 3 + loc;
467	}
468
469	return -1;
470}
471
472static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
473		int input)
474{
475	int i = eg_get_interpolator_index(
476		ctx->shader->input[input].interpolate,
477		ctx->shader->input[input].interpolate_location);
478	assert(i >= 0);
479	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
480}
481
/* Emit the INTERP_XY/INTERP_ZW ALU groups that interpolate one FS input
 * into its GPR, using the ij pair previously assigned to it.
 * Returns 0 on success or the r600_bytecode_add_alu() error code. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	/* each ij pair occupies two channels: pair 0 -> chans 1/0, pair 1 ->
	 * chans 3/2 of the same GPR, then the next GPR, and so on */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	/* two 4-slot groups: INTERP_ZW first (slots 0-3), then INTERP_XY
	 * (slots 4-7); each slot handles one destination channel (i % 4) */
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only slots 2,3 (the z,w results) and 4,5 (the x,y results)
		 * actually write to the input's GPR */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the two channels of the ij pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
522
523static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
524{
525	int i, r;
526	struct r600_bytecode_alu alu;
527
528	for (i = 0; i < 4; i++) {
529		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
530
531		alu.op = ALU_OP1_INTERP_LOAD_P0;
532
533		alu.dst.sel = ctx->shader->input[input].gpr;
534		alu.dst.write = 1;
535
536		alu.dst.chan = i;
537
538		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
539		alu.src[0].chan = i;
540
541		if (i == 3)
542			alu.last = 1;
543		r = r600_bytecode_add_alu(ctx->bc, &alu);
544		if (r)
545			return r;
546	}
547	return 0;
548}
549
550/*
551 * Special export handling in shaders
552 *
553 * shader export ARRAY_BASE for EXPORT_POS:
554 * 60 is position
555 * 61 is misc vector
556 * 62, 63 are clip distance vectors
557 *
558 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
559 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
560 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
561 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
562 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
563 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
564 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
565 * exclusive from render target index)
566 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
567 *
568 *
569 * shader export ARRAY_BASE for EXPORT_PIXEL:
570 * 0-7 CB targets
571 * 61 computed Z vector
572 *
573 * The use of the values exported in the computed Z vector are controlled
574 * by DB_SHADER_CONTROL:
575 * Z_EXPORT_ENABLE - Z as a float in RED
576 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
577 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
578 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
579 * DB_SOURCE_FORMAT - export control restrictions
580 *
581 */
582
583
584/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
585static int r600_spi_sid(struct r600_shader_io * io)
586{
587	int index, name = io->name;
588
589	/* These params are handled differently, they don't need
590	 * semantic indices, so we'll use 0 for them.
591	 */
592	if (name == TGSI_SEMANTIC_POSITION ||
593	    name == TGSI_SEMANTIC_PSIZE ||
594	    name == TGSI_SEMANTIC_EDGEFLAG ||
595	    name == TGSI_SEMANTIC_FACE ||
596	    name == TGSI_SEMANTIC_SAMPLEMASK)
597		index = 0;
598	else {
599		if (name == TGSI_SEMANTIC_GENERIC) {
600			/* For generic params simply use sid from tgsi */
601			index = io->sid;
602		} else {
603			/* For non-generic params - pack name and sid into 8 bits */
604			index = 0x80 | (name<<3) | (io->sid);
605		}
606
607		/* Make sure that all really used indices have nonzero value, so
608		 * we can just compare it to 0 later instead of comparing the name
609		 * with different values to detect special cases. */
610		index++;
611	}
612
613	return index;
614};
615
616/* we need this to get a common lds index for vs/tcs/tes input/outputs */
617int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
618{
619	switch (semantic_name) {
620	case TGSI_SEMANTIC_POSITION:
621		return 0;
622	case TGSI_SEMANTIC_PSIZE:
623		return 1;
624	case TGSI_SEMANTIC_CLIPDIST:
625		assert(index <= 1);
626		return 2 + index;
627	case TGSI_SEMANTIC_GENERIC:
628		if (index <= 63-4)
629			return 4 + index - 9;
630		else
631			/* same explanation as in the default statement,
632			 * the only user hitting this is st/nine.
633			 */
634			return 0;
635
636	/* patch indices are completely separate and thus start from 0 */
637	case TGSI_SEMANTIC_TESSOUTER:
638		return 0;
639	case TGSI_SEMANTIC_TESSINNER:
640		return 1;
641	case TGSI_SEMANTIC_PATCH:
642		return 2 + index;
643
644	default:
645		/* Don't fail here. The result of this function is only used
646		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
647		 * occur, but this function is called for all vertex shaders
648		 * before it's known whether LS will be compiled or not.
649		 */
650		return 0;
651	}
652}
653
654/* turn input into interpolate on EG */
655static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
656{
657	int r = 0;
658
659	if (ctx->shader->input[index].spi_sid) {
660		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
661		if (ctx->shader->input[index].interpolate > 0) {
662			evergreen_interp_assign_ij_index(ctx, index);
663			if (!ctx->use_llvm)
664				r = evergreen_interp_alu(ctx, index);
665		} else {
666			if (!ctx->use_llvm)
667				r = evergreen_interp_flat(ctx, index);
668		}
669	}
670	return r;
671}
672
673static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
674{
675	struct r600_bytecode_alu alu;
676	int i, r;
677	int gpr_front = ctx->shader->input[front].gpr;
678	int gpr_back = ctx->shader->input[back].gpr;
679
680	for (i = 0; i < 4; i++) {
681		memset(&alu, 0, sizeof(alu));
682		alu.op = ALU_OP3_CNDGT;
683		alu.is_op3 = 1;
684		alu.dst.write = 1;
685		alu.dst.sel = gpr_front;
686		alu.src[0].sel = ctx->face_gpr;
687		alu.src[1].sel = gpr_front;
688		alu.src[2].sel = gpr_back;
689
690		alu.dst.chan = i;
691		alu.src[1].chan = i;
692		alu.src[2].chan = i;
693		alu.last = (i==3);
694
695		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
696			return r;
697	}
698
699	return 0;
700}
701
/* execute a single slot ALU calculation */
/*
 * Emit one OP2 ALU instruction: dst_sel.dst_chan = op(src0, src1).
 *
 * If a source sel is V_SQ_ALU_SRC_LITERAL, the corresponding *_chan_val
 * argument is taken as the 32-bit literal value itself instead of a
 * channel number.
 *
 * Returns 0 on success or the r600_bytecode_add_alu() error code.
 */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	/* On CAYMAN, MULLO_INT only exists as a vector op (see the CAYMAN
	 * notes at the top of this file), so emit it in all four slots and
	 * write-enable only the requested channel. */
	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* normal single-slot path */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
757
/* execute a single slot ALU calculation */
/*
 * Emit one OP3 ALU instruction: dst_sel.dst_chan = op(src0, src1, src2).
 * Source handling matches single_alu_op2(): a sel of V_SQ_ALU_SRC_LITERAL
 * turns the corresponding *_chan_val into a 32-bit literal.
 *
 * NOTE(review): unlike single_alu_op2(), dst.write is not set here —
 * presumably OP3 encodings always write their destination; confirm
 * against r600_bytecode before reusing this for other ops.
 */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
796
797/* put it in temp_reg.x */
798static int get_lds_offset0(struct r600_shader_ctx *ctx,
799			   int rel_patch_chan,
800			   int temp_reg, bool is_patch_var)
801{
802	int r;
803
804	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
805	/* ADD
806	   Dimension - patch0_offset (input_vals.z),
807	   Non-dim - patch0_data_offset (input_vals.w)
808	*/
809	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
810			   temp_reg, 0,
811			   ctx->tess_output_info, 0,
812			   0, rel_patch_chan,
813			   ctx->tess_output_info, is_patch_var ? 3 : 2);
814	if (r)
815		return r;
816	return 0;
817}
818
819static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
820{
821	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
822}
823
824static int r600_get_temp(struct r600_shader_ctx *ctx)
825{
826	return ctx->temp_reg + ctx->max_driver_temp_used++;
827}
828
829static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
830{
831	int i;
832	i = ctx->shader->noutput++;
833	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
834	ctx->shader->output[i].sid = 0;
835	ctx->shader->output[i].gpr = 0;
836	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
837	ctx->shader->output[i].write_mask = 0x4;
838	ctx->shader->output[i].spi_sid = prim_id_sid;
839
840	return 0;
841}
842
843static int tgsi_declaration(struct r600_shader_ctx *ctx)
844{
845	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
846	int r, i, j, count = d->Range.Last - d->Range.First + 1;
847
848	switch (d->Declaration.File) {
849	case TGSI_FILE_INPUT:
850		for (j = 0; j < count; j++) {
851			i = ctx->shader->ninput + j;
852			assert(i < Elements(ctx->shader->input));
853			ctx->shader->input[i].name = d->Semantic.Name;
854			ctx->shader->input[i].sid = d->Semantic.Index + j;
855			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
856			ctx->shader->input[i].interpolate_location = d->Interp.Location;
857			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
858			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
859				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
860				switch (ctx->shader->input[i].name) {
861				case TGSI_SEMANTIC_FACE:
862					if (ctx->face_gpr != -1)
863						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
864					else
865						ctx->face_gpr = ctx->shader->input[i].gpr;
866					break;
867				case TGSI_SEMANTIC_COLOR:
868					ctx->colors_used++;
869					break;
870				case TGSI_SEMANTIC_POSITION:
871					ctx->fragcoord_input = i;
872					break;
873				case TGSI_SEMANTIC_PRIMID:
874					/* set this for now */
875					ctx->shader->gs_prim_id_input = true;
876					ctx->shader->ps_prim_id_input = i;
877					break;
878				}
879				if (ctx->bc->chip_class >= EVERGREEN) {
880					if ((r = evergreen_interp_input(ctx, i)))
881						return r;
882				}
883			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
884				/* FIXME probably skip inputs if they aren't passed in the ring */
885				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
886				ctx->next_ring_offset += 16;
887				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
888					ctx->shader->gs_prim_id_input = true;
889			}
890		}
891		ctx->shader->ninput += count;
892		break;
893	case TGSI_FILE_OUTPUT:
894		for (j = 0; j < count; j++) {
895			i = ctx->shader->noutput + j;
896			assert(i < Elements(ctx->shader->output));
897			ctx->shader->output[i].name = d->Semantic.Name;
898			ctx->shader->output[i].sid = d->Semantic.Index + j;
899			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
900			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
901			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
902			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
903			    ctx->type == TGSI_PROCESSOR_GEOMETRY ||
904			    ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
905				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
906				switch (d->Semantic.Name) {
907				case TGSI_SEMANTIC_CLIPDIST:
908					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
909									((d->Semantic.Index + j) << 2);
910					break;
911				case TGSI_SEMANTIC_PSIZE:
912					ctx->shader->vs_out_misc_write = 1;
913					ctx->shader->vs_out_point_size = 1;
914					break;
915				case TGSI_SEMANTIC_EDGEFLAG:
916					ctx->shader->vs_out_misc_write = 1;
917					ctx->shader->vs_out_edgeflag = 1;
918					ctx->edgeflag_output = i;
919					break;
920				case TGSI_SEMANTIC_VIEWPORT_INDEX:
921					ctx->shader->vs_out_misc_write = 1;
922					ctx->shader->vs_out_viewport = 1;
923					break;
924				case TGSI_SEMANTIC_LAYER:
925					ctx->shader->vs_out_misc_write = 1;
926					ctx->shader->vs_out_layer = 1;
927					break;
928				case TGSI_SEMANTIC_CLIPVERTEX:
929					ctx->clip_vertex_write = TRUE;
930					ctx->cv_output = i;
931					break;
932				}
933				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
934					ctx->gs_out_ring_offset += 16;
935				}
936			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
937				switch (d->Semantic.Name) {
938				case TGSI_SEMANTIC_COLOR:
939					ctx->shader->nr_ps_max_color_exports++;
940					break;
941				}
942			}
943		}
944		ctx->shader->noutput += count;
945		break;
946	case TGSI_FILE_TEMPORARY:
947		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
948			if (d->Array.ArrayID) {
949				r600_add_gpr_array(ctx->shader,
950				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
951								   d->Range.First,
952				               d->Range.Last - d->Range.First + 1, 0x0F);
953			}
954		}
955		break;
956
957	case TGSI_FILE_CONSTANT:
958	case TGSI_FILE_SAMPLER:
959	case TGSI_FILE_SAMPLER_VIEW:
960	case TGSI_FILE_ADDRESS:
961		break;
962
963	case TGSI_FILE_SYSTEM_VALUE:
964		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
965			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
966			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
967			break; /* Already handled from allocate_system_value_inputs */
968		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
969			if (!ctx->native_integers) {
970				struct r600_bytecode_alu alu;
971				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
972
973				alu.op = ALU_OP1_INT_TO_FLT;
974				alu.src[0].sel = 0;
975				alu.src[0].chan = 3;
976
977				alu.dst.sel = 0;
978				alu.dst.chan = 3;
979				alu.dst.write = 1;
980				alu.last = 1;
981
982				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
983					return r;
984			}
985			break;
986		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
987			break;
988		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
989			break;
990		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
991			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
992			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
993			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
994			unsigned temp_reg = r600_get_temp(ctx);
995
996			r = get_lds_offset0(ctx, 2, temp_reg, true);
997			if (r)
998				return r;
999
1000			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1001					   temp_reg, 0,
1002					   temp_reg, 0,
1003					   V_SQ_ALU_SRC_LITERAL, param * 16);
1004			if (r)
1005				return r;
1006
1007			do_lds_fetch_values(ctx, temp_reg, dreg);
1008		}
1009		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
1010			/* MOV r1.x, r0.x;
1011			   MOV r1.y, r0.y;
1012			*/
1013			for (i = 0; i < 2; i++) {
1014				struct r600_bytecode_alu alu;
1015				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1016				alu.op = ALU_OP1_MOV;
1017				alu.src[0].sel = 0;
1018				alu.src[0].chan = 0 + i;
1019				alu.dst.sel = 1;
1020				alu.dst.chan = 0 + i;
1021				alu.dst.write = 1;
1022				alu.last = (i == 1) ? 1 : 0;
1023				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1024					return r;
1025			}
1026			/* ADD r1.z, 1.0f, -r0.x */
1027			struct r600_bytecode_alu alu;
1028			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1029			alu.op = ALU_OP2_ADD;
1030			alu.src[0].sel = V_SQ_ALU_SRC_1;
1031			alu.src[1].sel = 1;
1032			alu.src[1].chan = 0;
1033			alu.src[1].neg = 1;
1034			alu.dst.sel = 1;
1035			alu.dst.chan = 2;
1036			alu.dst.write = 1;
1037			alu.last = 1;
1038			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1039				return r;
1040
1041			/* ADD r1.z, r1.z, -r1.y */
1042			alu.op = ALU_OP2_ADD;
1043			alu.src[0].sel = 1;
1044			alu.src[0].chan = 2;
1045			alu.src[1].sel = 1;
1046			alu.src[1].chan = 1;
1047			alu.src[1].neg = 1;
1048			alu.dst.sel = 1;
1049			alu.dst.chan = 2;
1050			alu.dst.write = 1;
1051			alu.last = 1;
1052			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1053				return r;
1054			break;
1055		}
1056		break;
1057	default:
1058		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
1059		return -EINVAL;
1060	}
1061	return 0;
1062}
1063
/* Scan the shader once to find which sample-related system values are read
 * (directly, or implicitly via interpolateAtSample/Offset/Centroid) and
 * reserve an input GPR for each one that is used.
 *
 * \param ctx         shader translation context (ctx->tokens is scanned)
 * \param gpr_offset  first free GPR index to allocate from
 * \return the next free GPR index (gpr_offset + number of GPRs reserved),
 *         or 0 if the TGSI token stream could not be parsed
 *
 * Side effects: appends entries to ctx->shader->input[], sets
 * ctx->face_gpr / ctx->fixed_pt_position_gpr, and marks any interpolators
 * needed by INTERP_* opcodes in ctx->eg_interpolators.
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	/* Table of candidate system-value inputs; each maps one or two TGSI
	 * semantics onto the hardware GPR the value is delivered in. */
	struct {
		boolean enabled;
		int *reg;
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				/* Record that this (interpolation mode, location)
				 * combination needs a barycentric interpolator. */
				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				/* A direct system-value declaration also enables
				 * the matching table entry. */
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* Allocate one GPR per enabled entry and register it as a shader
	 * input with constant (flat) interpolation. */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1143
1144/*
1145 * for evergreen we need to scan the shader to find the number of GPRs we need to
1146 * reserve for interpolation and system values
1147 *
1148 * we need to know if we are going to emit
1149 * any sample or centroid inputs
1150 * if perspective and linear are required
1151*/
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* Pass 1: enable an interpolator for every regular input declared by
	 * the shader (position/face/mask/sampleid are delivered in dedicated
	 * registers and need no barycentrics). */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				/* INTERP_SAMPLE/OFFSET resolve at center;
				 * INTERP_CENTROID at the centroid location. */
				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* Two barycentric (i,j) pairs fit in one GPR, so round up to the
	 * number of GPRs actually consumed. */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
1223
1224/* sample_id_sel == NULL means fetch for current sample */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	/* The fixed-point position GPR must have been allocated by
	 * allocate_system_value_inputs() before we can index with it. */
	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	/* Fetch the sample position from the driver's info constant buffer,
	 * indexed by sample id. */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		/* Explicit sample id: copy the selected channel into t1.x so
		 * the fetch can index from it. */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	/* Return the temp GPR that now holds the sample position. */
	return t1;
}
1278
/* Translate a TGSI source register into the driver's r600_shader_src form:
 * copy the swizzle/negate/absolute modifiers, then resolve the register
 * file to a hardware selector (literal value, dedicated system-value GPR,
 * or file-offset GPR index). */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* If all four components read the same immediate, try to map
		 * it onto a hardware inline constant (0, 1, etc.). */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* Otherwise emit the four literal dwords verbatim. */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* System values live in fixed GPRs/channels; force the
		 * swizzle to the channel the hardware delivers them in. */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* xy come from the fetched position; swizzle 4/5 are
			 * the inline 0/1 selects for the unused zw lanes —
			 * NOTE(review): presumably SEL_0 here, confirm against
			 * the swizzle encoding. */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->type != TGSI_PROCESSOR_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* Non-TCS invocation id: R1.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TCS invocation id: R0.z */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			/* Vertex count is packed into a different channel of
			 * the tess input-info GPR depending on shader stage. */
			if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 2;
				r600_src->swizzle[1] = 2;
				r600_src->swizzle[2] = 2;
				r600_src->swizzle[3] = 2;
			} else {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 3;
				r600_src->swizzle[1] = 3;
				r600_src->swizzle[2] = 3;
				r600_src->swizzle[3] = 3;
			}
		} else if (ctx->type == TGSI_PROCESSOR_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == TGSI_PROCESSOR_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		}
	} else {
		/* Regular register file: index + per-file GPR base offset. */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants carry the constant-buffer (kcache) bank in the
		 * second dimension, possibly itself indirect. */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1396
/* Fetch a relatively-addressed constant from constant buffer cb_idx into
 * dst_reg via a VTX fetch, since kcache cannot be indexed dynamically.
 *
 * \param cb_idx   constant buffer id
 * \param cb_rel   buffer index mode (non-zero when the buffer itself is
 *                 selected indirectly)
 * \param offset   constant element offset added to the address register
 * \param ar_chan  channel of the AR register holding the dynamic index
 * \param dst_reg  GPR that receives the fetched xyzw
 * \return 0 on success, negative error code on bytecode-emission failure
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* Bias the dynamic index: dst_reg.ar_chan = AR + offset.
		 * dst_reg doubles as scratch for the biased address. */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1452
/* Fetch one per-vertex GS input from the ESGS ring into dst_reg.
 * The ring offset for the referenced vertex is passed to the GS in
 * R0.x/R0.y/R0.w/R1.x/R1.y/R1.z; the vertex index (src->Dimension) selects
 * which of those channels to use, with an extra dance for indirect indices.
 * Returns 0 on success or a negative error code. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = vtx_id / 3;
	int offset_chan = vtx_id % 3;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == 0 && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect) {
		int treg[3];
		int t2;
		struct r600_bytecode_alu alu;
		/* NOTE(review): this inner 'r' shadows the outer one; harmless
		 * here since each emission is checked locally, but easy to
		 * trip over when editing. */
		int r, i;

		/* you have got to be shitting me -
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		t2 = r600_get_temp(ctx);
		for (i = 0; i < 3; i++) {
			/* copy R0.x/y/w into treg[i].x so the triple can be
			 * addressed with a relative GPR index */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = 0;
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[AR].x — pick the vertex's ring offset via the
		 * address register */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
	}


	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1533
1534static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1535{
1536	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1537	int i;
1538
1539	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1540		struct tgsi_full_src_register *src = &inst->Src[i];
1541
1542		if (src->Register.File == TGSI_FILE_INPUT) {
1543			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1544				/* primitive id is in R0.z */
1545				ctx->src[i].sel = 0;
1546				ctx->src[i].swizzle[0] = 2;
1547			}
1548		}
1549		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1550			int treg = r600_get_temp(ctx);
1551
1552			fetch_gs_input(ctx, src, treg);
1553			ctx->src[i].sel = treg;
1554		}
1555	}
1556	return 0;
1557}
1558
1559
1560/* Tessellation shaders pass outputs to the next shader using LDS.
1561 *
1562 * LS outputs = TCS(HS) inputs
1563 * TCS(HS) outputs = TES(DS) inputs
1564 *
1565 * The LDS layout is:
1566 * - TCS inputs for patch 0
1567 * - TCS inputs for patch 1
1568 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
1569 * - ...
1570 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
1571 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
1572 * - TCS outputs for patch 1
1573 * - Per-patch TCS outputs for patch 1
1574 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
1575 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1576 * - ...
1577 *
1578 * All three shaders VS(LS), TCS, TES share the same LDS space.
1579 */
1580/* this will return with the dw address in temp_reg.x */
/* Compute the LDS byte address of a TGSI input/output element and add it to
 * the base address already held in temp_reg.x.
 *
 * Exactly one of dst/src must be non-NULL; the address computation is the
 * same for both. Handles the optional vertex dimension (direct or indirect)
 * using stride_bytes_reg.stride_bytes_chan as the per-vertex stride, the
 * optional indirect element index (16 bytes per element), and the constant
 * per-semantic offset from r600_get_lds_unique_index().
 * Returns 0 on success or a negative error code. */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			/* Direct vertex index is folded in as a literal. */
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x = stride * vertex_index + temp.x */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg */
		/* temp.x = 16 * rel_index + temp.x (16 bytes per element) */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
1682
1683static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
1684			       unsigned dst_reg)
1685{
1686	struct r600_bytecode_alu alu;
1687	int r, i;
1688
1689	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
1690		ctx->bc->force_add_cf = 1;
1691	for (i = 1; i < 4; i++) {
1692		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1693				   temp_reg, i,
1694				   temp_reg, 0,
1695				   V_SQ_ALU_SRC_LITERAL, 4 * i);
1696	}
1697	for (i = 0; i < 4; i++) {
1698		/* emit an LDS_READ_RET */
1699		memset(&alu, 0, sizeof(alu));
1700		alu.op = LDS_OP1_LDS_READ_RET;
1701		alu.src[0].sel = temp_reg;
1702		alu.src[0].chan = i;
1703		alu.src[1].sel = V_SQ_ALU_SRC_0;
1704		alu.src[2].sel = V_SQ_ALU_SRC_0;
1705		alu.dst.chan = 0;
1706		alu.is_lds_idx_op = true;
1707		alu.last = 1;
1708		r = r600_bytecode_add_alu(ctx->bc, &alu);
1709		if (r)
1710			return r;
1711	}
1712	for (i = 0; i < 4; i++) {
1713		/* then read from LDS_OQ_A_POP */
1714		memset(&alu, 0, sizeof(alu));
1715
1716		alu.op = ALU_OP1_MOV;
1717		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
1718		alu.src[0].chan = 0;
1719		alu.dst.sel = dst_reg;
1720		alu.dst.chan = i;
1721		alu.dst.write = 1;
1722		alu.last = 1;
1723		r = r600_bytecode_add_alu(ctx->bc, &alu);
1724		if (r)
1725			return r;
1726	}
1727	return 0;
1728}
1729
1730static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1731{
1732	int r;
1733	unsigned temp_reg = r600_get_temp(ctx);
1734
1735	r = get_lds_offset0(ctx, 2, temp_reg,
1736			    src->Register.Dimension ? false : true);
1737	if (r)
1738		return r;
1739
1740	/* the base address is now in temp.x */
1741	r = r600_get_byte_address(ctx, temp_reg,
1742				  NULL, src, ctx->tess_output_info, 1);
1743	if (r)
1744		return r;
1745
1746	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1747	if (r)
1748		return r;
1749	return 0;
1750}
1751
1752static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1753{
1754	int r;
1755	unsigned temp_reg = r600_get_temp(ctx);
1756
1757	/* t.x = ips * r0.y */
1758	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1759			   temp_reg, 0,
1760			   ctx->tess_input_info, 0,
1761			   0, 1);
1762
1763	if (r)
1764		return r;
1765
1766	/* the base address is now in temp.x */
1767	r = r600_get_byte_address(ctx, temp_reg,
1768				  NULL, src, ctx->tess_input_info, 1);
1769	if (r)
1770		return r;
1771
1772	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1773	if (r)
1774		return r;
1775	return 0;
1776}
1777
1778static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1779{
1780	int r;
1781	unsigned temp_reg = r600_get_temp(ctx);
1782
1783	r = get_lds_offset0(ctx, 1, temp_reg,
1784			    src->Register.Dimension ? false : true);
1785	if (r)
1786		return r;
1787	/* the base address is now in temp.x */
1788	r = r600_get_byte_address(ctx, temp_reg,
1789				  NULL, src,
1790				  ctx->tess_output_info, 1);
1791	if (r)
1792		return r;
1793
1794	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
1795	if (r)
1796		return r;
1797	return 0;
1798}
1799
1800static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1801{
1802	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1803	int i;
1804
1805	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1806		struct tgsi_full_src_register *src = &inst->Src[i];
1807
1808		if (ctx->type == TGSI_PROCESSOR_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1809			int treg = r600_get_temp(ctx);
1810			fetch_tes_input(ctx, src, treg);
1811			ctx->src[i].sel = treg;
1812			ctx->src[i].rel = 0;
1813		}
1814		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1815			int treg = r600_get_temp(ctx);
1816			fetch_tcs_input(ctx, src, treg);
1817			ctx->src[i].sel = treg;
1818			ctx->src[i].rel = 0;
1819		}
1820		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1821			int treg = r600_get_temp(ctx);
1822			fetch_tcs_output(ctx, src, treg);
1823			ctx->src[i].sel = treg;
1824			ctx->src[i].rel = 0;
1825		}
1826	}
1827	return 0;
1828}
1829
/* Translate all sources of the current instruction (via tgsi_src) and then
 * rewrite constant-file sources so at most one kcache constant remains per
 * instruction: relatively-addressed constants are always fetched into a
 * temp, and when several direct constants appear, all but the last are
 * copied to temps. Returns 0 on success or a negative error code. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts down so the final remaining constant (j == 0) is allowed
	 * to stay in kcache. */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* Indirect constant: must go through a VTX fetch
			 * (sel - 512 converts the kcache sel back to a
			 * constant-buffer element index). */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* Direct constant with others still pending: copy its
			 * four channels into a temp with plain MOVs. */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1884
1885/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nliteral, r;

	/* Count literal sources; only if there is more than one do any need
	 * to be moved (an ALU instruction can embed one literal group). */
	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			nliteral++;
		}
	}
	/* j counts down; the last literal (j == 0) stays inline, every
	 * earlier one is MOVed channel-by-channel into a temp GPR. */
	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].value = ctx->src[i].value[k];
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			j--;
		}
	}
	return 0;
}
1921
1922static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1923{
1924	int i, r, count = ctx->shader->ninput;
1925
1926	for (i = 0; i < count; i++) {
1927		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1928			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1929			if (r)
1930				return r;
1931		}
1932	}
1933	return 0;
1934}
1935
1936static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
1937						  int stream, unsigned *stream_item_size)
1938{
1939	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1940	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
1941	int i, j, r;
1942
1943	/* Sanity checking. */
1944	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
1945		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1946		r = -EINVAL;
1947		goto out_err;
1948	}
1949	for (i = 0; i < so->num_outputs; i++) {
1950		if (so->output[i].output_buffer >= 4) {
1951			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1952				 so->output[i].output_buffer);
1953			r = -EINVAL;
1954			goto out_err;
1955		}
1956	}
1957
1958	/* Initialize locations where the outputs are stored. */
1959	for (i = 0; i < so->num_outputs; i++) {
1960
1961		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1962		start_comp[i] = so->output[i].start_component;
1963		/* Lower outputs with dst_offset < start_component.
1964		 *
1965		 * We can only output 4D vectors with a write mask, e.g. we can
1966		 * only output the W component at offset 3, etc. If we want
1967		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1968		 * to move it to X and output X. */
1969		if (so->output[i].dst_offset < so->output[i].start_component) {
1970			unsigned tmp = r600_get_temp(ctx);
1971
1972			for (j = 0; j < so->output[i].num_components; j++) {
1973				struct r600_bytecode_alu alu;
1974				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1975				alu.op = ALU_OP1_MOV;
1976				alu.src[0].sel = so_gpr[i];
1977				alu.src[0].chan = so->output[i].start_component + j;
1978
1979				alu.dst.sel = tmp;
1980				alu.dst.chan = j;
1981				alu.dst.write = 1;
1982				if (j == so->output[i].num_components - 1)
1983					alu.last = 1;
1984				r = r600_bytecode_add_alu(ctx->bc, &alu);
1985				if (r)
1986					return r;
1987			}
1988			start_comp[i] = 0;
1989			so_gpr[i] = tmp;
1990		}
1991	}
1992
1993	/* Write outputs to buffers. */
1994	for (i = 0; i < so->num_outputs; i++) {
1995		struct r600_bytecode_output output;
1996
1997		if (stream != -1 && stream != so->output[i].output_buffer)
1998			continue;
1999
2000		memset(&output, 0, sizeof(struct r600_bytecode_output));
2001		output.gpr = so_gpr[i];
2002		output.elem_size = so->output[i].num_components - 1;
2003		if (output.elem_size == 2)
2004			output.elem_size = 3; // 3 not supported, write 4 with junk at end
2005		output.array_base = so->output[i].dst_offset - start_comp[i];
2006		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2007		output.burst_count = 1;
2008		/* array_size is an upper limit for the burst_count
2009		 * with MEM_STREAM instructions */
2010		output.array_size = 0xFFF;
2011		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2012
2013		if (ctx->bc->chip_class >= EVERGREEN) {
2014			switch (so->output[i].output_buffer) {
2015			case 0:
2016				output.op = CF_OP_MEM_STREAM0_BUF0;
2017				break;
2018			case 1:
2019				output.op = CF_OP_MEM_STREAM0_BUF1;
2020				break;
2021			case 2:
2022				output.op = CF_OP_MEM_STREAM0_BUF2;
2023				break;
2024			case 3:
2025				output.op = CF_OP_MEM_STREAM0_BUF3;
2026				break;
2027			}
2028			output.op += so->output[i].stream * 4;
2029			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2030			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2031		} else {
2032			switch (so->output[i].output_buffer) {
2033			case 0:
2034				output.op = CF_OP_MEM_STREAM0;
2035				break;
2036			case 1:
2037				output.op = CF_OP_MEM_STREAM1;
2038				break;
2039			case 2:
2040				output.op = CF_OP_MEM_STREAM2;
2041				break;
2042			case 3:
2043				output.op = CF_OP_MEM_STREAM3;
2044					break;
2045			}
2046			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2047		}
2048		r = r600_bytecode_add_output(ctx->bc, &output);
2049		if (r)
2050			goto out_err;
2051	}
2052	return 0;
2053out_err:
2054	return r;
2055}
2056
2057static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2058{
2059	struct r600_bytecode_alu alu;
2060	unsigned reg;
2061
2062	if (!ctx->shader->vs_out_edgeflag)
2063		return;
2064
2065	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2066
2067	/* clamp(x, 0, 1) */
2068	memset(&alu, 0, sizeof(alu));
2069	alu.op = ALU_OP1_MOV;
2070	alu.src[0].sel = reg;
2071	alu.dst.sel = reg;
2072	alu.dst.write = 1;
2073	alu.dst.clamp = 1;
2074	alu.last = 1;
2075	r600_bytecode_add_alu(ctx->bc, &alu);
2076
2077	memset(&alu, 0, sizeof(alu));
2078	alu.op = ALU_OP1_FLT_TO_INT;
2079	alu.src[0].sel = reg;
2080	alu.dst.sel = reg;
2081	alu.dst.write = 1;
2082	alu.last = 1;
2083	r600_bytecode_add_alu(ctx->bc, &alu);
2084}
2085
/* Build the VS-stage "GS copy shader" that runs after a geometry shader.
 *
 * The GS writes its vertices to the GSVS ring buffer; the shader built
 * here fetches one vertex back from the ring, performs the stream-out
 * writes for each enabled stream, and exports the vertex exactly like a
 * regular vertex shader would (position/param/clip exports).
 *
 * On success the compiled shader is stored in gs->gs_copy_shader and
 * the return value is that of r600_bytecode_build().
 * NOTE(review): returns 0 (success) when the cshader allocation fails,
 * so callers cannot distinguish OOM from success -- confirm intended.
 */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	/* POS exports 60..63: 60 = position, 61 = misc vec (psize/layer/
	 * viewport), 62+ = clip distances. */
	int i, j, next_clip_pos = 61, next_param = 0;
	int ring;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* The copy shader re-exports exactly the GS outputs. */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x packs the ring read index (low 30 bits) and the stream ID
	 * (bits 30-31); split them into R0.x and R0.y. */
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		/* one GPR / one vec4 ring slot per output, starting at GPR 1 */
		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	/* temporaries start right after the GPRs holding the fetched vertex */
	ctx.temp_reg = i + 1;
	/* Emit per-stream stream-out, predicated on the stream ID in R0.y.
	 * Ring 0 is always emitted (it also carries the exports below). */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
		/* each vertex occupies ocnt vec4s (16 bytes each) in the ring */
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* skip outputs that are captured only on streams > 0 */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		/* swizzle 7 masks a channel; 4/5 select constant 0/1 */
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* The hardware requires at least one POS and one PARAM export;
	 * emit fully-masked dummies if the loop above produced none. */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* the final export of each kind must be EXPORT_DONE */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* patch the jump label of the last predicate block */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* only one level of predicate push is ever active at a time */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2391
2392static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2393{
2394	if (ind) {
2395		struct r600_bytecode_alu alu;
2396		int r;
2397
2398		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2399		alu.op = ALU_OP2_ADD_INT;
2400		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2401		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2402		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2403		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2404		alu.dst.write = 1;
2405		alu.last = 1;
2406		r = r600_bytecode_add_alu(ctx->bc, &alu);
2407		if (r)
2408			return r;
2409	}
2410	return 0;
2411}
2412
2413static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
2414{
2415	struct r600_bytecode_output output;
2416	int i, k, ring_offset;
2417	int effective_stream = stream == -1 ? 0 : stream;
2418	int idx = 0;
2419
2420	for (i = 0; i < ctx->shader->noutput; i++) {
2421		if (ctx->gs_for_vs) {
2422			/* for ES we need to lookup corresponding ring offset expected by GS
2423			 * (map this output to GS input by name and sid) */
2424			/* FIXME precompute offsets */
2425			ring_offset = -1;
2426			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2427				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2428				struct r600_shader_io *out = &ctx->shader->output[i];
2429				if (in->name == out->name && in->sid == out->sid)
2430					ring_offset = in->ring_offset;
2431			}
2432
2433			if (ring_offset == -1)
2434				continue;
2435		} else {
2436			ring_offset = idx * 16;
2437			idx++;
2438		}
2439
2440		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2441			continue;
2442		/* next_ring_offset after parsing input decls contains total size of
2443		 * single vertex data, gs_next_vertex - current vertex index */
2444		if (!ind)
2445			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2446
2447		memset(&output, 0, sizeof(struct r600_bytecode_output));
2448		output.gpr = ctx->shader->output[i].gpr;
2449		output.elem_size = 3;
2450		output.comp_mask = 0xF;
2451		output.burst_count = 1;
2452
2453		if (ind)
2454			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2455		else
2456			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2457
2458		switch (stream) {
2459		default:
2460		case 0:
2461			output.op = CF_OP_MEM_RING; break;
2462		case 1:
2463			output.op = CF_OP_MEM_RING1; break;
2464		case 2:
2465			output.op = CF_OP_MEM_RING2; break;
2466		case 3:
2467			output.op = CF_OP_MEM_RING3; break;
2468		}
2469
2470		if (ind) {
2471			output.array_base = ring_offset >> 2; /* in dwords */
2472			output.array_size = 0xfff;
2473			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2474		} else
2475			output.array_base = ring_offset >> 2; /* in dwords */
2476		r600_bytecode_add_output(ctx->bc, &output);
2477	}
2478
2479	++ctx->gs_next_vertex;
2480	return 0;
2481}
2482
2483
2484static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2485{
2486	int r;
2487	struct r600_bytecode_vtx vtx;
2488	int temp_val = ctx->temp_reg;
2489	/* need to store the TCS output somewhere */
2490	r = single_alu_op2(ctx, ALU_OP1_MOV,
2491			   temp_val, 0,
2492			   V_SQ_ALU_SRC_LITERAL, 0,
2493			   0, 0);
2494	if (r)
2495		return r;
2496
2497	/* used by VS/TCS */
2498	if (ctx->tess_input_info) {
2499		/* fetch tcs input values into resv space */
2500		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2501		vtx.op = FETCH_OP_VFETCH;
2502		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2503		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2504		vtx.mega_fetch_count = 16;
2505		vtx.data_format = FMT_32_32_32_32;
2506		vtx.num_format_all = 2;
2507		vtx.format_comp_all = 1;
2508		vtx.use_const_fields = 0;
2509		vtx.endian = r600_endian_swap(32);
2510		vtx.srf_mode_all = 1;
2511		vtx.offset = 0;
2512		vtx.dst_gpr = ctx->tess_input_info;
2513		vtx.dst_sel_x = 0;
2514		vtx.dst_sel_y = 1;
2515		vtx.dst_sel_z = 2;
2516		vtx.dst_sel_w = 3;
2517		vtx.src_gpr = temp_val;
2518		vtx.src_sel_x = 0;
2519
2520		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2521		if (r)
2522			return r;
2523	}
2524
2525	/* used by TCS/TES */
2526	if (ctx->tess_output_info) {
2527		/* fetch tcs output values into resv space */
2528		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2529		vtx.op = FETCH_OP_VFETCH;
2530		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2531		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2532		vtx.mega_fetch_count = 16;
2533		vtx.data_format = FMT_32_32_32_32;
2534		vtx.num_format_all = 2;
2535		vtx.format_comp_all = 1;
2536		vtx.use_const_fields = 0;
2537		vtx.endian = r600_endian_swap(32);
2538		vtx.srf_mode_all = 1;
2539		vtx.offset = 16;
2540		vtx.dst_gpr = ctx->tess_output_info;
2541		vtx.dst_sel_x = 0;
2542		vtx.dst_sel_y = 1;
2543		vtx.dst_sel_z = 2;
2544		vtx.dst_sel_w = 3;
2545		vtx.src_gpr = temp_val;
2546		vtx.src_sel_x = 0;
2547
2548		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2549		if (r)
2550			return r;
2551	}
2552	return 0;
2553}
2554
2555static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
2556{
2557	int i, j, r;
2558	int temp_reg;
2559
2560	/* fetch tcs input values into input_vals */
2561	ctx->tess_input_info = r600_get_temp(ctx);
2562	ctx->tess_output_info = 0;
2563	r = r600_fetch_tess_io_info(ctx);
2564	if (r)
2565		return r;
2566
2567	temp_reg = r600_get_temp(ctx);
2568	/* dst reg contains LDS address stride * idx */
2569	/* MUL vertexID, vertex_dw_stride */
2570	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2571			   temp_reg, 0,
2572			   ctx->tess_input_info, 1,
2573			   0, 1); /* rel id in r0.y? */
2574	if (r)
2575		return r;
2576
2577	for (i = 0; i < ctx->shader->noutput; i++) {
2578		struct r600_bytecode_alu alu;
2579		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);
2580
2581		if (param) {
2582			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2583					   temp_reg, 1,
2584					   temp_reg, 0,
2585					   V_SQ_ALU_SRC_LITERAL, param * 16);
2586			if (r)
2587				return r;
2588		}
2589
2590		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2591				   temp_reg, 2,
2592				   temp_reg, param ? 1 : 0,
2593				   V_SQ_ALU_SRC_LITERAL, 8);
2594		if (r)
2595			return r;
2596
2597
2598		for (j = 0; j < 2; j++) {
2599			int chan = (j == 1) ? 2 : (param ? 1 : 0);
2600			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2601			alu.op = LDS_OP3_LDS_WRITE_REL;
2602			alu.src[0].sel = temp_reg;
2603			alu.src[0].chan = chan;
2604			alu.src[1].sel = ctx->shader->output[i].gpr;
2605			alu.src[1].chan = j * 2;
2606			alu.src[2].sel = ctx->shader->output[i].gpr;
2607			alu.src[2].chan = (j * 2) + 1;
2608			alu.last = 1;
2609			alu.dst.chan = 0;
2610			alu.lds_idx = 1;
2611			alu.is_lds_idx_op = true;
2612			r = r600_bytecode_add_alu(ctx->bc, &alu);
2613			if (r)
2614				return r;
2615		}
2616	}
2617	return 0;
2618}
2619
/* Store the destination of the current TCS instruction into LDS.
 *
 * Computes the LDS byte address of the instruction's OUTPUT destination
 * (per-patch or per-vertex depending on Dimension), then writes each
 * enabled channel of the write mask. Adjacent enabled channel pairs
 * (xy or zw) are combined into a single two-dword LDS_WRITE_REL.
 * No-op (returns 0) for non-OUTPUT destinations.
 * Returns 0 on success or the first bytecode emission error.
 */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	/* no Dimension -> per-patch output, use the patch base offset */
	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	lasti = tgsi_last_instruction(write_mask);
	/* precompute per-channel addresses: temp.{y,z,w} = temp.x + 4*i */
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* xy or zw both enabled: write the pair with one
		 * LDS_WRITE_REL (address in src0, two values in src1/src2),
		 * then skip the second channel of the pair */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		/* lone channel: single-dword LDS_WRITE */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2703
2704static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2705				 int output_idx)
2706{
2707	int param;
2708	unsigned temp_reg = r600_get_temp(ctx);
2709	unsigned name = ctx->shader->output[output_idx].name;
2710	int dreg = ctx->shader->output[output_idx].gpr;
2711	int r;
2712
2713	param = r600_get_lds_unique_index(name, 0);
2714	r = get_lds_offset0(ctx, 1, temp_reg, true);
2715	if (r)
2716		return r;
2717
2718	r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2719			   temp_reg, 0,
2720			   temp_reg, 0,
2721			   V_SQ_ALU_SRC_LITERAL, param * 16);
2722	if (r)
2723		return r;
2724
2725	do_lds_fetch_values(ctx, temp_reg, dreg);
2726	return 0;
2727}
2728
/* Emit the tessellation-factor writes at the end of a TCS.
 *
 * Wraps the whole emission in a predicate so only invocation 0 of the
 * patch writes the factors, reads TESSOUTER/TESSINNER back from LDS,
 * computes the tess-factor buffer address from RelPatchID and tf_base,
 * and stores the factors through GDS TF_WRITE ops (two dwords each).
 * Returns 0 on success, -1 on a malformed shader (bad prim mode or
 * missing factor outputs), or a bytecode emission error.
 */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int i;
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int r;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z, 0 -- NOTE(review): src chan 2 (z) is
	 * read here although the original comment said R0.x; confirm
	 * which channel carries the invocation ID */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	/* one temp vec4 per pair of factor components (addr + value lanes) */
	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = i;
		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = i;
	}

	/* outer factors are mandatory; inner only for tri/quad domains */
	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* pack (address, value) pairs: treg[i/2].{x,z} = addr, .{y,w} = value */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* GDS TF_WRITE: address from src.x, value from src.y, dst masked */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);
		gds.src_sel_y = 1 + (2 * (i % 2));
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
2865
2866static int r600_shader_from_tgsi(struct r600_context *rctx,
2867				 struct r600_pipe_shader *pipeshader,
2868				 union r600_shader_key key)
2869{
2870	struct r600_screen *rscreen = rctx->screen;
2871	struct r600_shader *shader = &pipeshader->shader;
2872	struct tgsi_token *tokens = pipeshader->selector->tokens;
2873	struct pipe_stream_output_info so = pipeshader->selector->so;
2874	struct tgsi_full_immediate *immediate;
2875	struct r600_shader_ctx ctx;
2876	struct r600_bytecode_output output[32];
2877	unsigned output_done, noutput;
2878	unsigned opcode;
2879	int i, j, k, r = 0;
2880	int next_param_base = 0, next_clip_base;
2881	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
2882	/* Declarations used by llvm code */
2883	bool use_llvm = false;
2884	bool indirect_gprs;
2885	bool ring_outputs = false;
2886	bool lds_outputs = false;
2887	bool lds_inputs = false;
2888	bool pos_emitted = false;
2889
2890#ifdef R600_USE_LLVM
2891	use_llvm = rscreen->b.debug_flags & DBG_LLVM;
2892#endif
2893	ctx.bc = &shader->bc;
2894	ctx.shader = shader;
2895	ctx.native_integers = true;
2896
2897
2898	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
2899			   rscreen->has_compressed_msaa_texturing);
2900	ctx.tokens = tokens;
2901	tgsi_scan_shader(tokens, &ctx.info);
2902	shader->indirect_files = ctx.info.indirect_files;
2903
2904	shader->uses_doubles = ctx.info.uses_doubles;
2905
2906	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
2907	tgsi_parse_init(&ctx.parse, tokens);
2908	ctx.type = ctx.info.processor;
2909	shader->processor_type = ctx.type;
2910	ctx.bc->type = shader->processor_type;
2911
2912	switch (ctx.type) {
2913	case TGSI_PROCESSOR_VERTEX:
2914		shader->vs_as_gs_a = key.vs.as_gs_a;
2915		shader->vs_as_es = key.vs.as_es;
2916		shader->vs_as_ls = key.vs.as_ls;
2917		if (shader->vs_as_es)
2918			ring_outputs = true;
2919		if (shader->vs_as_ls)
2920			lds_outputs = true;
2921		break;
2922	case TGSI_PROCESSOR_GEOMETRY:
2923		ring_outputs = true;
2924		break;
2925	case TGSI_PROCESSOR_TESS_CTRL:
2926		shader->tcs_prim_mode = key.tcs.prim_mode;
2927		lds_outputs = true;
2928		lds_inputs = true;
2929		break;
2930	case TGSI_PROCESSOR_TESS_EVAL:
2931		shader->tes_as_es = key.tes.as_es;
2932		lds_inputs = true;
2933		if (shader->tes_as_es)
2934			ring_outputs = true;
2935		break;
2936	case TGSI_PROCESSOR_FRAGMENT:
2937		shader->two_side = key.ps.color_two_side;
2938		break;
2939	default:
2940		break;
2941	}
2942
2943	if (shader->vs_as_es || shader->tes_as_es) {
2944		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
2945	} else {
2946		ctx.gs_for_vs = NULL;
2947	}
2948
2949	ctx.next_ring_offset = 0;
2950	ctx.gs_out_ring_offset = 0;
2951	ctx.gs_next_vertex = 0;
2952	ctx.gs_stream_output_info = &so;
2953
2954	ctx.face_gpr = -1;
2955	ctx.fixed_pt_position_gpr = -1;
2956	ctx.fragcoord_input = -1;
2957	ctx.colors_used = 0;
2958	ctx.clip_vertex_write = 0;
2959
2960	shader->nr_ps_color_exports = 0;
2961	shader->nr_ps_max_color_exports = 0;
2962
2963
2964	/* register allocations */
2965	/* Values [0,127] correspond to GPR[0..127].
2966	 * Values [128,159] correspond to constant buffer bank 0
2967	 * Values [160,191] correspond to constant buffer bank 1
2968	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
2969	 * Values [256,287] correspond to constant buffer bank 2 (EG)
2970	 * Values [288,319] correspond to constant buffer bank 3 (EG)
2971	 * Other special values are shown in the list below.
2972	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
2973	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
2974	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
2975	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
2976	 * 248	SQ_ALU_SRC_0: special constant 0.0.
2977	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
2978	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
2979	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
2980	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
2981	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
2982	 * 254	SQ_ALU_SRC_PV: previous vector result.
2983	 * 255	SQ_ALU_SRC_PS: previous scalar result.
2984	 */
2985	for (i = 0; i < TGSI_FILE_COUNT; i++) {
2986		ctx.file_offset[i] = 0;
2987	}
2988
2989#ifdef R600_USE_LLVM
2990	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
2991		fprintf(stderr, "Warning: R600 LLVM backend does not support "
2992				"indirect adressing.  Falling back to TGSI "
2993				"backend.\n");
2994		use_llvm = 0;
2995	}
2996#endif
2997	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
2998		ctx.file_offset[TGSI_FILE_INPUT] = 1;
2999		if (!use_llvm) {
3000			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3001		}
3002	}
3003	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
3004		if (ctx.bc->chip_class >= EVERGREEN)
3005			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3006		else
3007			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3008	}
3009	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3010		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
3011		ctx.file_offset[TGSI_FILE_INPUT] = 2;
3012	}
3013	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
3014		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3015	if (ctx.type == TGSI_PROCESSOR_TESS_EVAL) {
3016		bool add_tesscoord = false, add_tess_inout = false;
3017		ctx.file_offset[TGSI_FILE_INPUT] = 1;
3018		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3019			/* if we have tesscoord save one reg */
3020			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3021				add_tesscoord = true;
3022			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3023			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3024				add_tess_inout = true;
3025		}
3026		if (add_tesscoord || add_tess_inout)
3027			ctx.file_offset[TGSI_FILE_INPUT]++;
3028		if (add_tess_inout)
3029			ctx.file_offset[TGSI_FILE_INPUT]+=2;
3030	}
3031	ctx.use_llvm = use_llvm;
3032
3033	if (use_llvm) {
3034		ctx.file_offset[TGSI_FILE_OUTPUT] =
3035			ctx.file_offset[TGSI_FILE_INPUT];
3036	} else {
3037	   ctx.file_offset[TGSI_FILE_OUTPUT] =
3038			ctx.file_offset[TGSI_FILE_INPUT] +
3039			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3040	}
3041	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3042						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3043
3044	/* Outside the GPR range. This will be translated to one of the
3045	 * kcache banks later. */
3046	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3047
3048	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3049	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3050			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
3051	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
3052	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
3053
3054	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
3055		ctx.tess_input_info = ctx.bc->ar_reg + 3;
3056		ctx.tess_output_info = ctx.bc->ar_reg + 4;
3057		ctx.temp_reg = ctx.bc->ar_reg + 5;
3058	} else if (ctx.type == TGSI_PROCESSOR_TESS_EVAL) {
3059		ctx.tess_input_info = 0;
3060		ctx.tess_output_info = ctx.bc->ar_reg + 3;
3061		ctx.temp_reg = ctx.bc->ar_reg + 4;
3062	} else if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3063		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
3064		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
3065		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
3066		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
3067		ctx.temp_reg = ctx.bc->ar_reg + 7;
3068	} else {
3069		ctx.temp_reg = ctx.bc->ar_reg + 3;
3070	}
3071
3072	shader->max_arrays = 0;
3073	shader->num_arrays = 0;
3074	if (indirect_gprs) {
3075
3076		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3077			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3078			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
3079			                   ctx.file_offset[TGSI_FILE_INPUT],
3080			                   0x0F);
3081		}
3082		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3083			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3084			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
3085			                   ctx.file_offset[TGSI_FILE_OUTPUT],
3086			                   0x0F);
3087		}
3088	}
3089
3090	ctx.nliterals = 0;
3091	ctx.literals = NULL;
3092
3093	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
3094	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3095	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3096
3097	if (shader->vs_as_gs_a)
3098		vs_add_primid_output(&ctx, key.vs.prim_id_out);
3099
3100	if (ctx.type == TGSI_PROCESSOR_TESS_EVAL)
3101		r600_fetch_tess_io_info(&ctx);
3102
3103	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3104		tgsi_parse_token(&ctx.parse);
3105		switch (ctx.parse.FullToken.Token.Type) {
3106		case TGSI_TOKEN_TYPE_IMMEDIATE:
3107			immediate = &ctx.parse.FullToken.FullImmediate;
3108			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3109			if(ctx.literals == NULL) {
3110				r = -ENOMEM;
3111				goto out_err;
3112			}
3113			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3114			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3115			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3116			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3117			ctx.nliterals++;
3118			break;
3119		case TGSI_TOKEN_TYPE_DECLARATION:
3120			r = tgsi_declaration(&ctx);
3121			if (r)
3122				goto out_err;
3123			break;
3124		case TGSI_TOKEN_TYPE_INSTRUCTION:
3125		case TGSI_TOKEN_TYPE_PROPERTY:
3126			break;
3127		default:
3128			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3129			r = -EINVAL;
3130			goto out_err;
3131		}
3132	}
3133
3134	shader->ring_item_sizes[0] = ctx.next_ring_offset;
3135	shader->ring_item_sizes[1] = 0;
3136	shader->ring_item_sizes[2] = 0;
3137	shader->ring_item_sizes[3] = 0;
3138
3139	/* Process two side if needed */
3140	if (shader->two_side && ctx.colors_used) {
3141		int i, count = ctx.shader->ninput;
3142		unsigned next_lds_loc = ctx.shader->nlds;
3143
3144		/* additional inputs will be allocated right after the existing inputs,
3145		 * we won't need them after the color selection, so we don't need to
3146		 * reserve these gprs for the rest of the shader code and to adjust
3147		 * output offsets etc. */
3148		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3149				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3150
3151		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3152		if (ctx.face_gpr == -1) {
3153			i = ctx.shader->ninput++;
3154			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3155			ctx.shader->input[i].spi_sid = 0;
3156			ctx.shader->input[i].gpr = gpr++;
3157			ctx.face_gpr = ctx.shader->input[i].gpr;
3158		}
3159
3160		for (i = 0; i < count; i++) {
3161			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3162				int ni = ctx.shader->ninput++;
3163				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3164				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3165				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3166				ctx.shader->input[ni].gpr = gpr++;
3167				// TGSI to LLVM needs to know the lds position of inputs.
3168				// Non LLVM path computes it later (in process_twoside_color)
3169				ctx.shader->input[ni].lds_pos = next_lds_loc++;
3170				ctx.shader->input[i].back_color_input = ni;
3171				if (ctx.bc->chip_class >= EVERGREEN) {
3172					if ((r = evergreen_interp_input(&ctx, ni)))
3173						return r;
3174				}
3175			}
3176		}
3177	}
3178
3179/* LLVM backend setup */
3180#ifdef R600_USE_LLVM
3181	if (use_llvm) {
3182		struct radeon_llvm_context radeon_llvm_ctx;
3183		LLVMModuleRef mod;
3184		bool dump = r600_can_dump_shader(&rscreen->b, tokens);
3185		boolean use_kill = false;
3186
3187		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
3188		radeon_llvm_ctx.type = ctx.type;
3189		radeon_llvm_ctx.two_side = shader->two_side;
3190		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
3191		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
3192		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
3193		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
3194		radeon_llvm_ctx.color_buffer_count = max_color_exports;
3195		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
3196		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
3197		radeon_llvm_ctx.stream_outputs = &so;
3198		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
3199		radeon_llvm_ctx.has_compressed_msaa_texturing =
3200			ctx.bc->has_compressed_msaa_texturing;
3201		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
3202		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
3203		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
3204
3205		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
3206			radeon_llvm_dispose(&radeon_llvm_ctx);
3207			use_llvm = 0;
3208			fprintf(stderr, "R600 LLVM backend failed to compile "
3209				"shader.  Falling back to TGSI\n");
3210		} else {
3211			ctx.file_offset[TGSI_FILE_OUTPUT] =
3212					ctx.file_offset[TGSI_FILE_INPUT];
3213		}
3214		if (use_kill)
3215			ctx.shader->uses_kill = use_kill;
3216		radeon_llvm_dispose(&radeon_llvm_ctx);
3217	}
3218#endif
3219/* End of LLVM backend setup */
3220
3221	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3222		shader->nr_ps_max_color_exports = 8;
3223
3224	if (!use_llvm) {
3225		if (ctx.fragcoord_input >= 0) {
3226			if (ctx.bc->chip_class == CAYMAN) {
3227				for (j = 0 ; j < 4; j++) {
3228					struct r600_bytecode_alu alu;
3229					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3230					alu.op = ALU_OP1_RECIP_IEEE;
3231					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3232					alu.src[0].chan = 3;
3233
3234					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3235					alu.dst.chan = j;
3236					alu.dst.write = (j == 3);
3237					alu.last = 1;
3238					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3239						return r;
3240				}
3241			} else {
3242				struct r600_bytecode_alu alu;
3243				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3244				alu.op = ALU_OP1_RECIP_IEEE;
3245				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3246				alu.src[0].chan = 3;
3247
3248				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3249				alu.dst.chan = 3;
3250				alu.dst.write = 1;
3251				alu.last = 1;
3252				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3253					return r;
3254			}
3255		}
3256
3257		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3258			struct r600_bytecode_alu alu;
3259			int r;
3260
3261			/* GS thread with no output workaround - emit a cut at start of GS */
3262			if (ctx.bc->chip_class == R600)
3263				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3264
3265			for (j = 0; j < 4; j++) {
3266				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3267				alu.op = ALU_OP1_MOV;
3268				alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3269				alu.src[0].value = 0;
3270				alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3271				alu.dst.write = 1;
3272				alu.last = 1;
3273				r = r600_bytecode_add_alu(ctx.bc, &alu);
3274				if (r)
3275					return r;
3276			}
3277		}
3278
3279		if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
3280			r600_fetch_tess_io_info(&ctx);
3281
3282		if (shader->two_side && ctx.colors_used) {
3283			if ((r = process_twoside_color_inputs(&ctx)))
3284				return r;
3285		}
3286
3287		tgsi_parse_init(&ctx.parse, tokens);
3288		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3289			tgsi_parse_token(&ctx.parse);
3290			switch (ctx.parse.FullToken.Token.Type) {
3291			case TGSI_TOKEN_TYPE_INSTRUCTION:
3292				r = tgsi_is_supported(&ctx);
3293				if (r)
3294					goto out_err;
3295				ctx.max_driver_temp_used = 0;
3296				/* reserve first tmp for everyone */
3297				r600_get_temp(&ctx);
3298
3299				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3300				if ((r = tgsi_split_constant(&ctx)))
3301					goto out_err;
3302				if ((r = tgsi_split_literal_constant(&ctx)))
3303					goto out_err;
3304				if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3305					if ((r = tgsi_split_gs_inputs(&ctx)))
3306						goto out_err;
3307				} else if (lds_inputs) {
3308					if ((r = tgsi_split_lds_inputs(&ctx)))
3309						goto out_err;
3310				}
3311				if (ctx.bc->chip_class == CAYMAN)
3312					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3313				else if (ctx.bc->chip_class >= EVERGREEN)
3314					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3315				else
3316					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3317				r = ctx.inst_info->process(&ctx);
3318				if (r)
3319					goto out_err;
3320
3321				if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
3322					r = r600_store_tcs_output(&ctx);
3323					if (r)
3324						goto out_err;
3325				}
3326				break;
3327			default:
3328				break;
3329			}
3330		}
3331	}
3332
3333	/* Reset the temporary register counter. */
3334	ctx.max_driver_temp_used = 0;
3335
3336	noutput = shader->noutput;
3337
3338	if (!ring_outputs && ctx.clip_vertex_write) {
3339		unsigned clipdist_temp[2];
3340
3341		clipdist_temp[0] = r600_get_temp(&ctx);
3342		clipdist_temp[1] = r600_get_temp(&ctx);
3343
3344		/* need to convert a clipvertex write into clipdistance writes and not export
3345		   the clip vertex anymore */
3346
3347		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3348		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3349		shader->output[noutput].gpr = clipdist_temp[0];
3350		noutput++;
3351		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3352		shader->output[noutput].gpr = clipdist_temp[1];
3353		noutput++;
3354
3355		/* reset spi_sid for clipvertex output to avoid confusing spi */
3356		shader->output[ctx.cv_output].spi_sid = 0;
3357
3358		shader->clip_dist_write = 0xFF;
3359
3360		for (i = 0; i < 8; i++) {
3361			int oreg = i >> 2;
3362			int ochan = i & 3;
3363
3364			for (j = 0; j < 4; j++) {
3365				struct r600_bytecode_alu alu;
3366				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3367				alu.op = ALU_OP2_DOT4;
3368				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3369				alu.src[0].chan = j;
3370
3371				alu.src[1].sel = 512 + i;
3372				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3373				alu.src[1].chan = j;
3374
3375				alu.dst.sel = clipdist_temp[oreg];
3376				alu.dst.chan = j;
3377				alu.dst.write = (j == ochan);
3378				if (j == 3)
3379					alu.last = 1;
3380				if (!use_llvm)
3381					r = r600_bytecode_add_alu(ctx.bc, &alu);
3382				if (r)
3383					return r;
3384			}
3385		}
3386	}
3387
3388	/* Add stream outputs. */
3389	if (!use_llvm && so.num_outputs) {
3390		bool emit = false;
3391		if (!lds_outputs && !ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX)
3392			emit = true;
3393		if (!ring_outputs && ctx.type == TGSI_PROCESSOR_TESS_EVAL)
3394			emit = true;
3395		if (emit)
3396			emit_streamout(&ctx, &so, -1, NULL);
3397	}
3398	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3399	convert_edgeflag_to_int(&ctx);
3400
3401	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
3402		r600_emit_tess_factor(&ctx);
3403
3404	if (lds_outputs) {
3405		if (ctx.type == TGSI_PROCESSOR_VERTEX) {
3406			if (ctx.shader->noutput)
3407				emit_lds_vs_writes(&ctx);
3408		}
3409	} else if (ring_outputs) {
3410		if (shader->vs_as_es || shader->tes_as_es) {
3411			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3412			ctx.gs_export_gpr_tregs[1] = -1;
3413			ctx.gs_export_gpr_tregs[2] = -1;
3414			ctx.gs_export_gpr_tregs[3] = -1;
3415
3416			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3417		}
3418	} else {
3419		/* Export output */
3420		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3421
3422		for (i = 0, j = 0; i < noutput; i++, j++) {
3423			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3424			output[j].gpr = shader->output[i].gpr;
3425			output[j].elem_size = 3;
3426			output[j].swizzle_x = 0;
3427			output[j].swizzle_y = 1;
3428			output[j].swizzle_z = 2;
3429			output[j].swizzle_w = 3;
3430			output[j].burst_count = 1;
3431			output[j].type = -1;
3432			output[j].op = CF_OP_EXPORT;
3433			switch (ctx.type) {
3434			case TGSI_PROCESSOR_VERTEX:
3435			case TGSI_PROCESSOR_TESS_EVAL:
3436				switch (shader->output[i].name) {
3437				case TGSI_SEMANTIC_POSITION:
3438					output[j].array_base = 60;
3439					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3440					pos_emitted = true;
3441					break;
3442
3443				case TGSI_SEMANTIC_PSIZE:
3444					output[j].array_base = 61;
3445					output[j].swizzle_y = 7;
3446					output[j].swizzle_z = 7;
3447					output[j].swizzle_w = 7;
3448					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3449					pos_emitted = true;
3450					break;
3451				case TGSI_SEMANTIC_EDGEFLAG:
3452					output[j].array_base = 61;
3453					output[j].swizzle_x = 7;
3454					output[j].swizzle_y = 0;
3455					output[j].swizzle_z = 7;
3456					output[j].swizzle_w = 7;
3457					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3458					pos_emitted = true;
3459					break;
3460				case TGSI_SEMANTIC_LAYER:
3461					/* spi_sid is 0 for outputs that are
3462					 * not consumed by PS */
3463					if (shader->output[i].spi_sid) {
3464						output[j].array_base = next_param_base++;
3465						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3466						j++;
3467						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3468					}
3469					output[j].array_base = 61;
3470					output[j].swizzle_x = 7;
3471					output[j].swizzle_y = 7;
3472					output[j].swizzle_z = 0;
3473					output[j].swizzle_w = 7;
3474					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3475					pos_emitted = true;
3476					break;
3477				case TGSI_SEMANTIC_VIEWPORT_INDEX:
3478					/* spi_sid is 0 for outputs that are
3479					 * not consumed by PS */
3480					if (shader->output[i].spi_sid) {
3481						output[j].array_base = next_param_base++;
3482						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3483						j++;
3484						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3485					}
3486					output[j].array_base = 61;
3487					output[j].swizzle_x = 7;
3488					output[j].swizzle_y = 7;
3489					output[j].swizzle_z = 7;
3490					output[j].swizzle_w = 0;
3491					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3492					pos_emitted = true;
3493					break;
3494				case TGSI_SEMANTIC_CLIPVERTEX:
3495					j--;
3496					break;
3497				case TGSI_SEMANTIC_CLIPDIST:
3498					output[j].array_base = next_clip_base++;
3499					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3500					pos_emitted = true;
3501					/* spi_sid is 0 for clipdistance outputs that were generated
3502					 * for clipvertex - we don't need to pass them to PS */
3503					if (shader->output[i].spi_sid) {
3504						j++;
3505						/* duplicate it as PARAM to pass to the pixel shader */
3506						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3507						output[j].array_base = next_param_base++;
3508						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3509					}
3510					break;
3511				case TGSI_SEMANTIC_FOG:
3512					output[j].swizzle_y = 4; /* 0 */
3513					output[j].swizzle_z = 4; /* 0 */
3514					output[j].swizzle_w = 5; /* 1 */
3515					break;
3516				case TGSI_SEMANTIC_PRIMID:
3517					output[j].swizzle_x = 2;
3518					output[j].swizzle_y = 4; /* 0 */
3519					output[j].swizzle_z = 4; /* 0 */
3520					output[j].swizzle_w = 4; /* 0 */
3521					break;
3522				}
3523
3524				break;
3525			case TGSI_PROCESSOR_FRAGMENT:
3526				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3527					/* never export more colors than the number of CBs */
3528					if (shader->output[i].sid >= max_color_exports) {
3529						/* skip export */
3530						j--;
3531						continue;
3532					}
3533					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3534					output[j].array_base = shader->output[i].sid;
3535					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3536					shader->nr_ps_color_exports++;
3537					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3538						for (k = 1; k < max_color_exports; k++) {
3539							j++;
3540							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3541							output[j].gpr = shader->output[i].gpr;
3542							output[j].elem_size = 3;
3543							output[j].swizzle_x = 0;
3544							output[j].swizzle_y = 1;
3545							output[j].swizzle_z = 2;
3546							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3547							output[j].burst_count = 1;
3548							output[j].array_base = k;
3549							output[j].op = CF_OP_EXPORT;
3550							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3551							shader->nr_ps_color_exports++;
3552						}
3553					}
3554				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3555					output[j].array_base = 61;
3556					output[j].swizzle_x = 2;
3557					output[j].swizzle_y = 7;
3558					output[j].swizzle_z = output[j].swizzle_w = 7;
3559					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3560				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3561					output[j].array_base = 61;
3562					output[j].swizzle_x = 7;
3563					output[j].swizzle_y = 1;
3564					output[j].swizzle_z = output[j].swizzle_w = 7;
3565					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3566				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3567					output[j].array_base = 61;
3568					output[j].swizzle_x = 7;
3569					output[j].swizzle_y = 7;
3570					output[j].swizzle_z = 0;
3571					output[j].swizzle_w = 7;
3572					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3573				} else {
3574					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3575					r = -EINVAL;
3576					goto out_err;
3577				}
3578				break;
3579			case TGSI_PROCESSOR_TESS_CTRL:
3580				break;
3581			default:
3582				R600_ERR("unsupported processor type %d\n", ctx.type);
3583				r = -EINVAL;
3584				goto out_err;
3585			}
3586
3587			if (output[j].type==-1) {
3588				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3589				output[j].array_base = next_param_base++;
3590			}
3591		}
3592
3593		/* add fake position export */
3594		if ((ctx.type == TGSI_PROCESSOR_VERTEX || ctx.type == TGSI_PROCESSOR_TESS_EVAL) && pos_emitted == false) {
3595			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3596			output[j].gpr = 0;
3597			output[j].elem_size = 3;
3598			output[j].swizzle_x = 7;
3599			output[j].swizzle_y = 7;
3600			output[j].swizzle_z = 7;
3601			output[j].swizzle_w = 7;
3602			output[j].burst_count = 1;
3603			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3604			output[j].array_base = 60;
3605			output[j].op = CF_OP_EXPORT;
3606			j++;
3607		}
3608
3609		/* add fake param output for vertex shader if no param is exported */
3610		if ((ctx.type == TGSI_PROCESSOR_VERTEX || ctx.type == TGSI_PROCESSOR_TESS_EVAL) && next_param_base == 0) {
3611			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3612			output[j].gpr = 0;
3613			output[j].elem_size = 3;
3614			output[j].swizzle_x = 7;
3615			output[j].swizzle_y = 7;
3616			output[j].swizzle_z = 7;
3617			output[j].swizzle_w = 7;
3618			output[j].burst_count = 1;
3619			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3620			output[j].array_base = 0;
3621			output[j].op = CF_OP_EXPORT;
3622			j++;
3623		}
3624
3625		/* add fake pixel export */
3626		if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
3627			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3628			output[j].gpr = 0;
3629			output[j].elem_size = 3;
3630			output[j].swizzle_x = 7;
3631			output[j].swizzle_y = 7;
3632			output[j].swizzle_z = 7;
3633			output[j].swizzle_w = 7;
3634			output[j].burst_count = 1;
3635			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3636			output[j].array_base = 0;
3637			output[j].op = CF_OP_EXPORT;
3638			j++;
3639			shader->nr_ps_color_exports++;
3640		}
3641
3642		noutput = j;
3643
3644		/* set export done on last export of each type */
3645		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
3646			if (!(output_done & (1 << output[i].type))) {
3647				output_done |= (1 << output[i].type);
3648				output[i].op = CF_OP_EXPORT_DONE;
3649			}
3650		}
3651		/* add output to bytecode */
3652		if (!use_llvm) {
3653			for (i = 0; i < noutput; i++) {
3654				r = r600_bytecode_add_output(ctx.bc, &output[i]);
3655				if (r)
3656					goto out_err;
3657			}
3658		}
3659	}
3660
3661	/* add program end */
3662	if (!use_llvm) {
3663		if (ctx.bc->chip_class == CAYMAN)
3664			cm_bytecode_add_cf_end(ctx.bc);
3665		else {
3666			const struct cf_op_info *last = NULL;
3667
3668			if (ctx.bc->cf_last)
3669				last = r600_isa_cf(ctx.bc->cf_last->op);
3670
3671			/* alu clause instructions don't have EOP bit, so add NOP */
3672			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
3673				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
3674
3675			ctx.bc->cf_last->end_of_program = 1;
3676		}
3677	}
3678
3679	/* check GPR limit - we have 124 = 128 - 4
3680	 * (4 are reserved as alu clause temporary registers) */
3681	if (ctx.bc->ngpr > 124) {
3682		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
3683		r = -ENOMEM;
3684		goto out_err;
3685	}
3686
3687	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3688		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
3689			return r;
3690	}
3691
3692	free(ctx.literals);
3693	tgsi_parse_free(&ctx.parse);
3694	return 0;
3695out_err:
3696	free(ctx.literals);
3697	tgsi_parse_free(&ctx.parse);
3698	return r;
3699}
3700
3701static int tgsi_unsupported(struct r600_shader_ctx *ctx)
3702{
3703	const unsigned tgsi_opcode =
3704		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
3705	R600_ERR("%s tgsi opcode unsupported\n",
3706		 tgsi_get_opcode_name(tgsi_opcode));
3707	return -EINVAL;
3708}
3709
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	/* TGSI END emits no bytecode here; the end-of-program marker is
	 * added after the main translation loop. */
	return 0;
}
3714
3715static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
3716			const struct r600_shader_src *shader_src,
3717			unsigned chan)
3718{
3719	bc_src->sel = shader_src->sel;
3720	bc_src->chan = shader_src->swizzle[chan];
3721	bc_src->neg = shader_src->neg;
3722	bc_src->abs = shader_src->abs;
3723	bc_src->rel = shader_src->rel;
3724	bc_src->value = shader_src->value[bc_src->chan];
3725	bc_src->kc_bank = shader_src->kc_bank;
3726	bc_src->kc_rel = shader_src->kc_rel;
3727}
3728
3729static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
3730{
3731	bc_src->abs = 1;
3732	bc_src->neg = 0;
3733}
3734
3735static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
3736{
3737	bc_src->neg = !bc_src->neg;
3738}
3739
3740static void tgsi_dst(struct r600_shader_ctx *ctx,
3741		     const struct tgsi_full_dst_register *tgsi_dst,
3742		     unsigned swizzle,
3743		     struct r600_bytecode_alu_dst *r600_dst)
3744{
3745	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3746
3747	r600_dst->sel = tgsi_dst->Register.Index;
3748	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
3749	r600_dst->chan = swizzle;
3750	r600_dst->write = 1;
3751	if (inst->Instruction.Saturate) {
3752		r600_dst->clamp = 1;
3753	}
3754	if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) {
3755		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
3756			return;
3757		}
3758	}
3759	if (tgsi_dst->Register.Indirect)
3760		r600_dst->rel = V_SQ_REL_RELATIVE;
3761
3762}
3763
/* Emit ALU code for a two-source 64-bit (double) TGSI operation.
 *
 * A double occupies a channel pair (xy or zw), so the TGSI writemask is
 * expanded to cover whole pairs before emitting.
 *
 * singledest - the op produces a single double result per pair (e.g. DADD
 *              variants handled by the _single_dest wrappers); the writemask
 *              names only the component the caller wants.
 * swap       - emit src[1] before src[0] (operand order reversed).
 *
 * Returns 0 on success or the error from r600_bytecode_add_alu().
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* Expand the mask to the full channel pair.  When the caller
		 * asked for the high channel of a pair (y or z), the result
		 * must be computed into a temp first and moved afterwards;
		 * use_tmp-1 is the temp channel the result lands in. */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	/* recompute after the mask expansion above */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				/* spill to temp; moved to the real dst below */
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* only the low channel of each pair carries the result */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64 sources use the swapped-word channel order */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			/* reversed operand order */
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			/* sign/abs of a double live in its high word, so the
			 * modifiers are applied on the odd channels only */
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* restore the caller's original (unexpanded) writemask */
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3866
3867static int tgsi_op2_64(struct r600_shader_ctx *ctx)
3868{
3869	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3870	unsigned write_mask = inst->Dst[0].Register.WriteMask;
3871	/* confirm writemasking */
3872	if ((write_mask & 0x3) != 0x3 &&
3873	    (write_mask & 0xc) != 0xc) {
3874		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
3875		return -1;
3876	}
3877	return tgsi_op2_64_params(ctx, false, false);
3878}
3879
/* Two-source 64-bit op producing a single double result; operands in
 * natural order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
3884
/* Two-source 64-bit op producing a single double result, with the two
 * operands swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
3889
/* Emit a three-source 64-bit op across all four slots.
 * Channels not covered by the writemask are routed to a scratch
 * temp so the slot still executes without clobbering the dest. */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* NOTE(review): slots 0-2 read source channel 1, slot 3
			 * reads channel 0 — presumably the hw's expected hi/lo
			 * pair layout for doubles; confirm against ISA docs. */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		/* unwritten channels land in the scratch temp */
		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3922
/* Generic two-source op emitter.
 * swap:       exchange src0/src1 (for ops the hw only has one way round).
 * trans_only: op exists only in the transcendental slot, so each channel
 *             must end its own instruction group (alu.last on every one),
 *             and multi-channel results are staged through temp_reg.
 * Returns 0 on success or the bytecode-emission error code. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is emitted as ADD with src1 negated */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			/* ABS is emitted as MOV with the abs modifier */
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
3992
/* Plain two-source op: no operand swap, vector slots allowed. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
3997
/* Two-source op with src0/src1 exchanged before emission. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4002
/* Two-source op restricted to the transcendental slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4007
4008static int tgsi_ineg(struct r600_shader_ctx *ctx)
4009{
4010	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4011	struct r600_bytecode_alu alu;
4012	int i, r;
4013	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4014
4015	for (i = 0; i < lasti + 1; i++) {
4016
4017		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4018			continue;
4019		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4020		alu.op = ctx->inst_info->op;
4021
4022		alu.src[0].sel = V_SQ_ALU_SRC_0;
4023
4024		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4025
4026		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4027
4028		if (i == lasti) {
4029			alu.last = 1;
4030		}
4031		r = r600_bytecode_add_alu(ctx->bc, &alu);
4032		if (r)
4033			return r;
4034	}
4035	return 0;
4036
4037}
4038
4039static int tgsi_dneg(struct r600_shader_ctx *ctx)
4040{
4041	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4042	struct r600_bytecode_alu alu;
4043	int i, r;
4044	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4045
4046	for (i = 0; i < lasti + 1; i++) {
4047
4048		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4049			continue;
4050		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4051		alu.op = ALU_OP1_MOV;
4052
4053		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4054
4055		if (i == 1 || i == 3)
4056			r600_bytecode_src_toggle_neg(&alu.src[0]);
4057		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4058
4059		if (i == lasti) {
4060			alu.last = 1;
4061		}
4062		r = r600_bytecode_add_alu(ctx->bc, &alu);
4063		if (r)
4064			return r;
4065	}
4066	return 0;
4067
4068}
4069
/* DFRACEXP: split a double into fraction (dst0, a channel pair) and
 * integer exponent (dst1, a single channel).  The hw op is run across
 * all four slots into temp_reg, then the pieces are moved out. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* fraction goes to the ZW pair when that's the requested mask */
	int firsti = write_mask == 0xc ? 2 : 0;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* fp64_switch re-maps source channels into the hi/lo
			 * order the 64-bit op expects */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* NOTE(review): fraction result is read from temp channels
		 * 2/3 here — presumably the op's output layout; confirm
		 * against the Evergreen ISA docs. */
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			/* exponent is read from temp channel 1 */
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* only the first written channel of dst1 is filled */
			break;
		}
	}
	return 0;
}
4130
4131
/* I2D/U2D on Evergreen/Cayman: convert int -> float into a temp, then
 * widen float -> double, emitting the 64-bit results as lo/hi channel
 * pairs (odd channels get a zero literal placeholder). */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* step 1: int -> float32 per double (one temp channel each).
	 * NOTE(review): `<=` makes this run (lasti+1)/2 + 1 times — one
	 * more conversion than there are doubles; looks harmless (extra
	 * temp write) but verify intent. */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* step 2: float32 -> float64, even channels carry the converted
	 * value, odd channels feed a literal 0 into the widening op */
	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4178
/* D2I/D2U on Evergreen/Cayman: narrow double -> float32 into a temp
 * (one result per channel pair, landing in the even channels), then
 * convert float -> int into the destination. */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
		inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* step 1: float64 -> float32; only even channels keep the result */
	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT64_TO_FLT32;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = i%2 == 0;
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* step 2: float32 -> int into dst.
	 * NOTE(review): the destination channel is hard-coded to 0 here
	 * and the `<=` bound gives one extra iteration — for a writemask
	 * covering two doubles (xyzw) every converted value lands on dst
	 * channel 0.  Looks wrong (expected `i` as the channel), but left
	 * untouched pending confirmation against the fixed upstream code. */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i*2;
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4220
/* Emit a Cayman double-precision scalar op (DRSQ/DSQRT/DRCP style):
 * run the op across the vector slots with the result captured in
 * t1.xy, then fan the pair out to the written destination channels. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		/* operand pair: src0 = high word, src1 = low word */
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		/* only the X/Y pair of t1 holds the 64-bit result */
		alu.dst.write = (i == 0 || i == 1);

		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* replicate t1.xy into the destination channel pairs */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* even dst channels take t1.x, odd ones t1.y */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4274
4275static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
4276{
4277	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4278	int i, j, r;
4279	struct r600_bytecode_alu alu;
4280	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4281
4282	for (i = 0 ; i < last_slot; i++) {
4283		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4284		alu.op = ctx->inst_info->op;
4285		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4286			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
4287
4288			/* RSQ should take the absolute value of src */
4289			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
4290				r600_bytecode_src_set_abs(&alu.src[j]);
4291			}
4292		}
4293		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4294		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4295
4296		if (i == last_slot - 1)
4297			alu.last = 1;
4298		r = r600_bytecode_add_alu(ctx->bc, &alu);
4299		if (r)
4300			return r;
4301	}
4302	return 0;
4303}
4304
/* Cayman integer multiply (MULHI/MULLO style): for each written
 * channel, run the op in all four slots (hw requirement), keeping only
 * the matching slot's result in t1, then move t1 to the destination. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			/* only the slot matching the source channel writes */
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy staged results out of t1 into the real destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4352
4353
4354static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
4355{
4356	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4357	int i, j, k, r;
4358	struct r600_bytecode_alu alu;
4359	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4360	int t1 = ctx->temp_reg;
4361
4362	for (k = 0; k < 2; k++) {
4363		if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
4364			continue;
4365
4366		for (i = 0; i < 4; i++) {
4367			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4368			alu.op = ctx->inst_info->op;
4369			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4370				r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
4371			}
4372			alu.dst.sel = t1;
4373			alu.dst.chan = i;
4374			alu.dst.write = 1;
4375			if (i == 3)
4376				alu.last = 1;
4377			r = r600_bytecode_add_alu(ctx->bc, &alu);
4378			if (r)
4379				return r;
4380		}
4381	}
4382
4383	for (i = 0; i <= lasti; i++) {
4384		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4385			continue;
4386		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4387		alu.op = ALU_OP1_MOV;
4388		alu.src[0].sel = t1;
4389		alu.src[0].chan = i;
4390		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4391		alu.dst.write = 1;
4392		if (i == lasti)
4393			alu.last = 1;
4394		r = r600_bytecode_add_alu(ctx->bc, &alu);
4395		if (r)
4396			return r;
4397	}
4398
4399	return 0;
4400}
4401
4402/*
4403 * r600 - trunc to -PI..PI range
4404 * r700 - normalize by dividing by 2PI
4405 * see fdo bug 27901
4406 */
4407static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
4408{
4409	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
4410	static float double_pi = 3.1415926535 * 2;
4411	static float neg_pi = -3.1415926535;
4412
4413	int r;
4414	struct r600_bytecode_alu alu;
4415
4416	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4417	alu.op = ALU_OP3_MULADD;
4418	alu.is_op3 = 1;
4419
4420	alu.dst.chan = 0;
4421	alu.dst.sel = ctx->temp_reg;
4422	alu.dst.write = 1;
4423
4424	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4425
4426	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4427	alu.src[1].chan = 0;
4428	alu.src[1].value = *(uint32_t *)&half_inv_pi;
4429	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4430	alu.src[2].chan = 0;
4431	alu.last = 1;
4432	r = r600_bytecode_add_alu(ctx->bc, &alu);
4433	if (r)
4434		return r;
4435
4436	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4437	alu.op = ALU_OP1_FRACT;
4438
4439	alu.dst.chan = 0;
4440	alu.dst.sel = ctx->temp_reg;
4441	alu.dst.write = 1;
4442
4443	alu.src[0].sel = ctx->temp_reg;
4444	alu.src[0].chan = 0;
4445	alu.last = 1;
4446	r = r600_bytecode_add_alu(ctx->bc, &alu);
4447	if (r)
4448		return r;
4449
4450	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4451	alu.op = ALU_OP3_MULADD;
4452	alu.is_op3 = 1;
4453
4454	alu.dst.chan = 0;
4455	alu.dst.sel = ctx->temp_reg;
4456	alu.dst.write = 1;
4457
4458	alu.src[0].sel = ctx->temp_reg;
4459	alu.src[0].chan = 0;
4460
4461	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4462	alu.src[1].chan = 0;
4463	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4464	alu.src[2].chan = 0;
4465
4466	if (ctx->bc->chip_class == R600) {
4467		alu.src[1].value = *(uint32_t *)&double_pi;
4468		alu.src[2].value = *(uint32_t *)&neg_pi;
4469	} else {
4470		alu.src[1].sel = V_SQ_ALU_SRC_1;
4471		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4472		alu.src[2].neg = 1;
4473	}
4474
4475	alu.last = 1;
4476	r = r600_bytecode_add_alu(ctx->bc, &alu);
4477	if (r)
4478		return r;
4479	return 0;
4480}
4481
/* Cayman SIN/COS: range-reduce into temp_reg.x, then issue the trig op
 * across the vector slots, one per destination channel. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		/* every slot reads the reduced angle from temp.x */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4512
/* Pre-Cayman SIN/COS: range-reduce, compute the scalar result into
 * temp.x, then replicate it to every written destination channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = trig(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		/* src chan is 0 from the memset — reads temp.x */
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4555
/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), honoring the
 * writemask per component.  Cayman needs the trig op replicated over
 * three slots with only the target slot writing. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 actually writes dst.x */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 actually writes dst.y */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4676
4677static int tgsi_kill(struct r600_shader_ctx *ctx)
4678{
4679	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4680	struct r600_bytecode_alu alu;
4681	int i, r;
4682
4683	for (i = 0; i < 4; i++) {
4684		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4685		alu.op = ctx->inst_info->op;
4686
4687		alu.dst.chan = i;
4688
4689		alu.src[0].sel = V_SQ_ALU_SRC_0;
4690
4691		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
4692			alu.src[1].sel = V_SQ_ALU_SRC_1;
4693			alu.src[1].neg = 1;
4694		} else {
4695			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4696		}
4697		if (i == 3) {
4698			alu.last = 1;
4699		}
4700		r = r600_bytecode_add_alu(ctx->bc, &alu);
4701		if (r)
4702			return r;
4703	}
4704
4705	/* kill must be last in ALU */
4706	ctx->bc->force_add_cf = 1;
4707	ctx->shader->uses_kill = TRUE;
4708	return 0;
4709}
4710
/* LIT: dst = (1.0, max(src.x,0), spec, 1.0) where
 * spec = (src.x > 0) ? pow(max(src.y,0), clamp(src.w)) : 0, realized
 * via LOG_CLAMPED + MUL_LIT + EXP_IEEE.  The expensive z path is only
 * emitted when dst.z is actually written. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			/* replicate the scalar LOG over three slots, keeping
			 * only the z-slot result (see CAYMAN notes up top) */
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log landed (branch-dependent) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0  */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
4860
4861static int tgsi_rsq(struct r600_shader_ctx *ctx)
4862{
4863	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4864	struct r600_bytecode_alu alu;
4865	int i, r;
4866
4867	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4868
4869	/* XXX:
4870	 * For state trackers other than OpenGL, we'll want to use
4871	 * _RECIPSQRT_IEEE instead.
4872	 */
4873	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
4874
4875	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4876		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4877		r600_bytecode_src_set_abs(&alu.src[i]);
4878	}
4879	alu.dst.sel = ctx->temp_reg;
4880	alu.dst.write = 1;
4881	alu.last = 1;
4882	r = r600_bytecode_add_alu(ctx->bc, &alu);
4883	if (r)
4884		return r;
4885	/* replicate result */
4886	return tgsi_helper_tempx_replicate(ctx);
4887}
4888
/* Copy temp_reg.x into every destination channel selected by the
 * writemask (MOVs are emitted for all four slots; non-written slots
 * have dst.write cleared). */
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		/* src chan is 0 from the memset — reads temp.x */
		alu.src[0].sel = ctx->temp_reg;
		alu.op = ALU_OP1_MOV;
		alu.dst.chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4910
4911static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
4912{
4913	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4914	struct r600_bytecode_alu alu;
4915	int i, r;
4916
4917	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4918	alu.op = ctx->inst_info->op;
4919	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4920		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4921	}
4922	alu.dst.sel = ctx->temp_reg;
4923	alu.dst.write = 1;
4924	alu.last = 1;
4925	r = r600_bytecode_add_alu(ctx->bc, &alu);
4926	if (r)
4927		return r;
4928	/* replicate result */
4929	return tgsi_helper_tempx_replicate(ctx);
4930}
4931
/* Cayman POW(a,b) = EXP2(b * LOG2(a)), with the scalar LOG and EXP
 * replicated across the vector slots as the hardware requires. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* LOG2(a) replicated into temp.xyz */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4981
/* Pre-Cayman POW(a,b) = EXP2(b * LOG2(a)), computed as three scalar
 * ops through temp.x and then replicated to the destination. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* fan temp.x out to the written destination channels */
	return tgsi_helper_tempx_replicate(ctx);
}
5020
/* Emit the ALU sequence for integer division / modulo (UDIV, UMOD,
 * IDIV, IMOD).  The hardware has no integer divide instruction, so the
 * quotient is derived from an unsigned reciprocal (RECIP_UINT, or a
 * float-reciprocal emulation on Cayman) with explicit correction of
 * the rounding error; the step-by-step recipe is in the comment block
 * below.
 *
 * mod:       0 = produce the quotient (DIV), 1 = produce the remainder (MOD)
 * signed_op: 0 = unsigned operands; 1 = signed operands, in which case
 *            the unsigned sequence runs on |src0| / |src1| and the sign
 *            of the result is patched at the end.
 *
 * Uses ctx->temp_reg (tmp0) plus three scratch temps.  On Cayman the
 * t-slot-only ops (RECIP_*, MULLO/MULHI) are emitted once per vector
 * slot with only one slot's write enabled (see the CAYMAN notes at the
 * top of this file).
 */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y       = r
	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 *   15. tmp1.z = tmp0.z + 1			= q + 1
	 *   16. tmp1.w = tmp0.z - 1			= q - 1
	 *
	 * else MOD
	 *
	 *   15. tmp1.z = tmp0.w - src2			= r - src2
	 *   16. tmp1.w = tmp0.w + src2			= r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src2 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman has no RECIP_UINT; emulate it by converting to
			 * float, taking the float reciprocal and scaling by 2^32.
			 */
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			/* 0x4f800000 is 2^32 as a float */
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000;

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;
				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}
				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 3);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 2;

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 0;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 0;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 0;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				if (signed_op) {
					alu.src[0].sel = tmp2;
					alu.src[0].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
				}

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 2;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 12. tmp0.w = src1 - tmp0.y       = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */

			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		/* For the signed path the final value still needs a sign fixup,
		 * so keep it in tmp0.z; otherwise write the destination directly.
		 */
		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}
	return 0;
}
5864
/* TGSI UDIV: unsigned integer division (quotient, unsigned operands). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
5869
/* TGSI UMOD: unsigned integer modulo (remainder, unsigned operands). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
5874
/* TGSI IDIV: signed integer division (quotient, signed operands). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
5879
/* TGSI IMOD: signed integer modulo (remainder, signed operands). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
5884
5885
/* Float-to-integer conversion (F2I / F2U).
 * Two passes over the written channels:
 *   1. TRUNC the float source into ctx->temp_reg (round toward zero),
 *   2. convert the truncated value with ctx->inst_info->op
 *      (FLT_TO_INT or FLT_TO_UINT) into the real destination.
 */
static int tgsi_f2i(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* Pass 1: temp.chan = trunc(src.chan) for each written channel. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_TRUNC;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Pass 2: dst.chan = int(temp.chan). */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		/* Each FLT_TO_UINT closes its own instruction group;
		 * NOTE(review): presumably because it is restricted to the
		 * trans slot on some generations — confirm against the ISA.
		 */
		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5934
5935static int tgsi_iabs(struct r600_shader_ctx *ctx)
5936{
5937	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5938	struct r600_bytecode_alu alu;
5939	int i, r;
5940	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5941	int last_inst = tgsi_last_instruction(write_mask);
5942
5943	/* tmp = -src */
5944	for (i = 0; i < 4; i++) {
5945		if (!(write_mask & (1<<i)))
5946			continue;
5947
5948		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5949		alu.op = ALU_OP2_SUB_INT;
5950
5951		alu.dst.sel = ctx->temp_reg;
5952		alu.dst.chan = i;
5953		alu.dst.write = 1;
5954
5955		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5956		alu.src[0].sel = V_SQ_ALU_SRC_0;
5957
5958		if (i == last_inst)
5959			alu.last = 1;
5960		r = r600_bytecode_add_alu(ctx->bc, &alu);
5961		if (r)
5962			return r;
5963	}
5964
5965	/* dst = (src >= 0 ? src : tmp) */
5966	for (i = 0; i < 4; i++) {
5967		if (!(write_mask & (1<<i)))
5968			continue;
5969
5970		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5971		alu.op = ALU_OP3_CNDGE_INT;
5972		alu.is_op3 = 1;
5973		alu.dst.write = 1;
5974
5975		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5976
5977		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5978		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5979		alu.src[2].sel = ctx->temp_reg;
5980		alu.src[2].chan = i;
5981
5982		if (i == last_inst)
5983			alu.last = 1;
5984		r = r600_bytecode_add_alu(ctx->bc, &alu);
5985		if (r)
5986			return r;
5987	}
5988	return 0;
5989}
5990
5991static int tgsi_issg(struct r600_shader_ctx *ctx)
5992{
5993	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5994	struct r600_bytecode_alu alu;
5995	int i, r;
5996	unsigned write_mask = inst->Dst[0].Register.WriteMask;
5997	int last_inst = tgsi_last_instruction(write_mask);
5998
5999	/* tmp = (src >= 0 ? src : -1) */
6000	for (i = 0; i < 4; i++) {
6001		if (!(write_mask & (1<<i)))
6002			continue;
6003
6004		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6005		alu.op = ALU_OP3_CNDGE_INT;
6006		alu.is_op3 = 1;
6007
6008		alu.dst.sel = ctx->temp_reg;
6009		alu.dst.chan = i;
6010		alu.dst.write = 1;
6011
6012		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6013		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6014		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6015
6016		if (i == last_inst)
6017			alu.last = 1;
6018		r = r600_bytecode_add_alu(ctx->bc, &alu);
6019		if (r)
6020			return r;
6021	}
6022
6023	/* dst = (tmp > 0 ? 1 : tmp) */
6024	for (i = 0; i < 4; i++) {
6025		if (!(write_mask & (1<<i)))
6026			continue;
6027
6028		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6029		alu.op = ALU_OP3_CNDGT_INT;
6030		alu.is_op3 = 1;
6031		alu.dst.write = 1;
6032
6033		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6034
6035		alu.src[0].sel = ctx->temp_reg;
6036		alu.src[0].chan = i;
6037
6038		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6039
6040		alu.src[2].sel = ctx->temp_reg;
6041		alu.src[2].chan = i;
6042
6043		if (i == last_inst)
6044			alu.last = 1;
6045		r = r600_bytecode_add_alu(ctx->bc, &alu);
6046		if (r)
6047			return r;
6048	}
6049	return 0;
6050}
6051
6052
6053
6054static int tgsi_ssg(struct r600_shader_ctx *ctx)
6055{
6056	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6057	struct r600_bytecode_alu alu;
6058	int i, r;
6059
6060	/* tmp = (src > 0 ? 1 : src) */
6061	for (i = 0; i < 4; i++) {
6062		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6063		alu.op = ALU_OP3_CNDGT;
6064		alu.is_op3 = 1;
6065
6066		alu.dst.sel = ctx->temp_reg;
6067		alu.dst.chan = i;
6068
6069		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6070		alu.src[1].sel = V_SQ_ALU_SRC_1;
6071		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6072
6073		if (i == 3)
6074			alu.last = 1;
6075		r = r600_bytecode_add_alu(ctx->bc, &alu);
6076		if (r)
6077			return r;
6078	}
6079
6080	/* dst = (-tmp > 0 ? -1 : tmp) */
6081	for (i = 0; i < 4; i++) {
6082		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6083		alu.op = ALU_OP3_CNDGT;
6084		alu.is_op3 = 1;
6085		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6086
6087		alu.src[0].sel = ctx->temp_reg;
6088		alu.src[0].chan = i;
6089		alu.src[0].neg = 1;
6090
6091		alu.src[1].sel = V_SQ_ALU_SRC_1;
6092		alu.src[1].neg = 1;
6093
6094		alu.src[2].sel = ctx->temp_reg;
6095		alu.src[2].chan = i;
6096
6097		if (i == 3)
6098			alu.last = 1;
6099		r = r600_bytecode_add_alu(ctx->bc, &alu);
6100		if (r)
6101			return r;
6102	}
6103	return 0;
6104}
6105
/* TGSI BFI (bitfield insert): dst = insert src1 into src0 at bit
 * offset src2 for src3 bits.  Three passes over the written channels:
 *   1. t1 = BFM(src3, src2)  -- build the bit mask
 *   2. t2 = src1 << src2     -- align the insert value
 *   3. dst = BFI(t1, t2, src0)
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = ctx->temp_reg;

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6185
/* TGSI IMSB/UMSB: position of the most significant (set/sign) bit.
 * FFBH counts from the MSB side, TGSI wants an LSB-based index, so:
 *   1. t1 = FFBH_INT / FFBH_UINT (src)
 *   2. t2 = 31 - t1
 *   3. dst = t1 >= 0 ? t2 : t1
 * The final select keeps t1 unchanged when it is negative —
 * NOTE(review): this appears to preserve the hardware's "no bit
 * found" result (all-ones, i.e. -1); confirm against the ISA doc.
 */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6271
/* Lower TGSI INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE for
 * Evergreen/Cayman.  The I/J barycentrics come from the interpolator GPRs
 * recorded in ctx->eg_interpolators; for the OFFSET and SAMPLE variants the
 * center I/J pair is first adjusted using screen-space gradients before the
 * INTERP_XY/ZW ops are issued.
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* two I/J pairs are packed per GPR: pair 0 in chans 0-1, pair 1 in 2-3 */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* fetch d(I,J)/dx and d(I,J)/dy of the center barycentrics */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = I/J + gradH * offset.x (offset.x = sample_pos.z for
		 * INTERP_SAMPLE, src1.x otherwise) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy += gradV * offset.y */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Issue the interpolation itself: one INTERP_ZW group followed by one
	 * INTERP_XY group.  Only slots 2..5 write results (dst chans 2,3 from
	 * ZW, then 0,1 from XY). */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2); /* pairs read J first, then I */
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		/* the attribute's parameter data */
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6435
6436
6437static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6438{
6439	struct r600_bytecode_alu alu;
6440	int i, r;
6441
6442	for (i = 0; i < 4; i++) {
6443		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6444		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6445			alu.op = ALU_OP0_NOP;
6446			alu.dst.chan = i;
6447		} else {
6448			alu.op = ALU_OP1_MOV;
6449			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6450			alu.src[0].sel = ctx->temp_reg;
6451			alu.src[0].chan = i;
6452		}
6453		if (i == 3) {
6454			alu.last = 1;
6455		}
6456		r = r600_bytecode_add_alu(ctx->bc, &alu);
6457		if (r)
6458			return r;
6459	}
6460	return 0;
6461}
6462
6463static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6464                                 unsigned temp, int chan,
6465                                 struct r600_bytecode_alu_src *bc_src,
6466                                 const struct r600_shader_src *shader_src)
6467{
6468	struct r600_bytecode_alu alu;
6469	int r;
6470
6471	r600_bytecode_src(bc_src, shader_src, chan);
6472
6473	/* op3 operands don't support abs modifier */
6474	if (bc_src->abs) {
6475		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
6476		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6477		alu.op = ALU_OP1_MOV;
6478		alu.dst.sel = temp;
6479		alu.dst.chan = chan;
6480		alu.dst.write = 1;
6481
6482		alu.src[0] = *bc_src;
6483		alu.last = true; // sufficient?
6484		r = r600_bytecode_add_alu(ctx->bc, &alu);
6485		if (r)
6486			return r;
6487
6488		memset(bc_src, 0, sizeof(*bc_src));
6489		bc_src->sel = temp;
6490		bc_src->chan = chan;
6491	}
6492	return 0;
6493}
6494
6495static int tgsi_op3(struct r600_shader_ctx *ctx)
6496{
6497	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6498	struct r600_bytecode_alu alu;
6499	int i, j, r;
6500	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6501	int temp_regs[4];
6502
6503	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6504		temp_regs[j] = 0;
6505		if (ctx->src[j].abs)
6506			temp_regs[j] = r600_get_temp(ctx);
6507	}
6508	for (i = 0; i < lasti + 1; i++) {
6509		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6510			continue;
6511
6512		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6513		alu.op = ctx->inst_info->op;
6514		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6515			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6516			if (r)
6517				return r;
6518		}
6519
6520		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6521		alu.dst.chan = i;
6522		alu.dst.write = 1;
6523		alu.is_op3 = 1;
6524		if (i == lasti) {
6525			alu.last = 1;
6526		}
6527		r = r600_bytecode_add_alu(ctx->bc, &alu);
6528		if (r)
6529			return r;
6530	}
6531	return 0;
6532}
6533
6534static int tgsi_dp(struct r600_shader_ctx *ctx)
6535{
6536	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6537	struct r600_bytecode_alu alu;
6538	int i, j, r;
6539
6540	for (i = 0; i < 4; i++) {
6541		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6542		alu.op = ctx->inst_info->op;
6543		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6544			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6545		}
6546
6547		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6548		alu.dst.chan = i;
6549		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6550		/* handle some special cases */
6551		switch (inst->Instruction.Opcode) {
6552		case TGSI_OPCODE_DP2:
6553			if (i > 1) {
6554				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6555				alu.src[0].chan = alu.src[1].chan = 0;
6556			}
6557			break;
6558		case TGSI_OPCODE_DP3:
6559			if (i > 2) {
6560				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6561				alu.src[0].chan = alu.src[1].chan = 0;
6562			}
6563			break;
6564		case TGSI_OPCODE_DPH:
6565			if (i == 3) {
6566				alu.src[0].sel = V_SQ_ALU_SRC_1;
6567				alu.src[0].chan = 0;
6568				alu.src[0].neg = 0;
6569			}
6570			break;
6571		default:
6572			break;
6573		}
6574		if (i == 3) {
6575			alu.last = 1;
6576		}
6577		r = r600_bytecode_add_alu(ctx->bc, &alu);
6578		if (r)
6579			return r;
6580	}
6581	return 0;
6582}
6583
6584static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6585						    unsigned index)
6586{
6587	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6588	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6589		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6590		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6591		ctx->src[index].neg || ctx->src[index].abs ||
6592		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
6593}
6594
6595static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6596					unsigned index)
6597{
6598	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6599	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6600}
6601
6602static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
6603{
6604	struct r600_bytecode_vtx vtx;
6605	struct r600_bytecode_alu alu;
6606	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6607	int src_gpr, r, i;
6608	int id = tgsi_tex_get_src_gpr(ctx, 1);
6609
6610	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6611	if (src_requires_loading) {
6612		for (i = 0; i < 4; i++) {
6613			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6614			alu.op = ALU_OP1_MOV;
6615			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6616			alu.dst.sel = ctx->temp_reg;
6617			alu.dst.chan = i;
6618			if (i == 3)
6619				alu.last = 1;
6620			alu.dst.write = 1;
6621			r = r600_bytecode_add_alu(ctx->bc, &alu);
6622			if (r)
6623				return r;
6624		}
6625		src_gpr = ctx->temp_reg;
6626	}
6627
6628	memset(&vtx, 0, sizeof(vtx));
6629	vtx.op = FETCH_OP_VFETCH;
6630	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6631	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6632	vtx.src_gpr = src_gpr;
6633	vtx.mega_fetch_count = 16;
6634	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6635	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
6636	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
6637	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
6638	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
6639	vtx.use_const_fields = 1;
6640
6641	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
6642		return r;
6643
6644	if (ctx->bc->chip_class >= EVERGREEN)
6645		return 0;
6646
6647	for (i = 0; i < 4; i++) {
6648		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6649		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6650			continue;
6651
6652		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6653		alu.op = ALU_OP2_AND_INT;
6654
6655		alu.dst.chan = i;
6656		alu.dst.sel = vtx.dst_gpr;
6657		alu.dst.write = 1;
6658
6659		alu.src[0].sel = vtx.dst_gpr;
6660		alu.src[0].chan = i;
6661
6662		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
6663		alu.src[1].sel += (id * 2);
6664		alu.src[1].chan = i % 4;
6665		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6666
6667		if (i == lasti)
6668			alu.last = 1;
6669		r = r600_bytecode_add_alu(ctx->bc, &alu);
6670		if (r)
6671			return r;
6672	}
6673
6674	if (inst->Dst[0].Register.WriteMask & 3) {
6675		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6676		alu.op = ALU_OP2_OR_INT;
6677
6678		alu.dst.chan = 3;
6679		alu.dst.sel = vtx.dst_gpr;
6680		alu.dst.write = 1;
6681
6682		alu.src[0].sel = vtx.dst_gpr;
6683		alu.src[0].chan = 3;
6684
6685		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
6686		alu.src[1].chan = 0;
6687		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6688
6689		alu.last = 1;
6690		r = r600_bytecode_add_alu(ctx->bc, &alu);
6691		if (r)
6692			return r;
6693	}
6694	return 0;
6695}
6696
6697static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
6698{
6699	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6700	struct r600_bytecode_alu alu;
6701	int r;
6702	int id = tgsi_tex_get_src_gpr(ctx, 1);
6703
6704	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6705	alu.op = ALU_OP1_MOV;
6706	alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6707	if (ctx->bc->chip_class >= EVERGREEN) {
6708		/* channel 0 or 2 of each word */
6709		alu.src[0].sel += (id / 2);
6710		alu.src[0].chan = (id % 2) * 2;
6711	} else {
6712		/* r600 we have them at channel 2 of the second dword */
6713		alu.src[0].sel += (id * 2) + 1;
6714		alu.src[0].chan = 1;
6715	}
6716	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6717	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6718	alu.last = 1;
6719	r = r600_bytecode_add_alu(ctx->bc, &alu);
6720	if (r)
6721		return r;
6722	return 0;
6723}
6724
6725static int tgsi_tex(struct r600_shader_ctx *ctx)
6726{
6727	static float one_point_five = 1.5f;
6728	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6729	struct r600_bytecode_tex tex;
6730	struct r600_bytecode_alu alu;
6731	unsigned src_gpr;
6732	int r, i, j;
6733	int opcode;
6734	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
6735				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6736				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
6737				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
6738
6739	bool txf_add_offsets = inst->Texture.NumOffsets &&
6740			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6741			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
6742
6743	/* Texture fetch instructions can only use gprs as source.
6744	 * Also they cannot negate the source or take the absolute value */
6745	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
6746					      inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
6747                                              tgsi_tex_src_requires_loading(ctx, 0)) ||
6748					     read_compressed_msaa || txf_add_offsets;
6749
6750	boolean src_loaded = FALSE;
6751	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
6752	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
6753	boolean has_txq_cube_array_z = false;
6754	unsigned sampler_index_mode;
6755
6756	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
6757	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6758	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
6759		if (inst->Dst[0].Register.WriteMask & 4) {
6760			ctx->shader->has_txq_cube_array_z_comp = true;
6761			has_txq_cube_array_z = true;
6762		}
6763
6764	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
6765	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6766	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
6767	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
6768		sampler_src_reg = 2;
6769
6770	/* TGSI moves the sampler to src reg 3 for TXD */
6771	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
6772		sampler_src_reg = 3;
6773
6774	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6775
6776	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6777
6778	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
6779		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
6780			ctx->shader->uses_tex_buffers = true;
6781			return r600_do_buffer_txq(ctx);
6782		}
6783		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
6784			if (ctx->bc->chip_class < EVERGREEN)
6785				ctx->shader->uses_tex_buffers = true;
6786			return do_vtx_fetch_inst(ctx, src_requires_loading);
6787		}
6788	}
6789
6790	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
6791		int out_chan;
6792		/* Add perspective divide */
6793		if (ctx->bc->chip_class == CAYMAN) {
6794			out_chan = 2;
6795			for (i = 0; i < 3; i++) {
6796				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6797				alu.op = ALU_OP1_RECIP_IEEE;
6798				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6799
6800				alu.dst.sel = ctx->temp_reg;
6801				alu.dst.chan = i;
6802				if (i == 2)
6803					alu.last = 1;
6804				if (out_chan == i)
6805					alu.dst.write = 1;
6806				r = r600_bytecode_add_alu(ctx->bc, &alu);
6807				if (r)
6808					return r;
6809			}
6810
6811		} else {
6812			out_chan = 3;
6813			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6814			alu.op = ALU_OP1_RECIP_IEEE;
6815			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6816
6817			alu.dst.sel = ctx->temp_reg;
6818			alu.dst.chan = out_chan;
6819			alu.last = 1;
6820			alu.dst.write = 1;
6821			r = r600_bytecode_add_alu(ctx->bc, &alu);
6822			if (r)
6823				return r;
6824		}
6825
6826		for (i = 0; i < 3; i++) {
6827			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6828			alu.op = ALU_OP2_MUL;
6829			alu.src[0].sel = ctx->temp_reg;
6830			alu.src[0].chan = out_chan;
6831			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6832			alu.dst.sel = ctx->temp_reg;
6833			alu.dst.chan = i;
6834			alu.dst.write = 1;
6835			r = r600_bytecode_add_alu(ctx->bc, &alu);
6836			if (r)
6837				return r;
6838		}
6839		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6840		alu.op = ALU_OP1_MOV;
6841		alu.src[0].sel = V_SQ_ALU_SRC_1;
6842		alu.src[0].chan = 0;
6843		alu.dst.sel = ctx->temp_reg;
6844		alu.dst.chan = 3;
6845		alu.last = 1;
6846		alu.dst.write = 1;
6847		r = r600_bytecode_add_alu(ctx->bc, &alu);
6848		if (r)
6849			return r;
6850		src_loaded = TRUE;
6851		src_gpr = ctx->temp_reg;
6852	}
6853
6854
6855	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6856	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6857	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6858	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6859	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
6860	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
6861
6862		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
6863		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
6864
6865		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
6866		for (i = 0; i < 4; i++) {
6867			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6868			alu.op = ALU_OP2_CUBE;
6869			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
6870			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
6871			alu.dst.sel = ctx->temp_reg;
6872			alu.dst.chan = i;
6873			if (i == 3)
6874				alu.last = 1;
6875			alu.dst.write = 1;
6876			r = r600_bytecode_add_alu(ctx->bc, &alu);
6877			if (r)
6878				return r;
6879		}
6880
6881		/* tmp1.z = RCP_e(|tmp1.z|) */
6882		if (ctx->bc->chip_class == CAYMAN) {
6883			for (i = 0; i < 3; i++) {
6884				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6885				alu.op = ALU_OP1_RECIP_IEEE;
6886				alu.src[0].sel = ctx->temp_reg;
6887				alu.src[0].chan = 2;
6888				alu.src[0].abs = 1;
6889				alu.dst.sel = ctx->temp_reg;
6890				alu.dst.chan = i;
6891				if (i == 2)
6892					alu.dst.write = 1;
6893				if (i == 2)
6894					alu.last = 1;
6895				r = r600_bytecode_add_alu(ctx->bc, &alu);
6896				if (r)
6897					return r;
6898			}
6899		} else {
6900			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6901			alu.op = ALU_OP1_RECIP_IEEE;
6902			alu.src[0].sel = ctx->temp_reg;
6903			alu.src[0].chan = 2;
6904			alu.src[0].abs = 1;
6905			alu.dst.sel = ctx->temp_reg;
6906			alu.dst.chan = 2;
6907			alu.dst.write = 1;
6908			alu.last = 1;
6909			r = r600_bytecode_add_alu(ctx->bc, &alu);
6910			if (r)
6911				return r;
6912		}
6913
6914		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
6915		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
6916		 * muladd has no writemask, have to use another temp
6917		 */
6918		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6919		alu.op = ALU_OP3_MULADD;
6920		alu.is_op3 = 1;
6921
6922		alu.src[0].sel = ctx->temp_reg;
6923		alu.src[0].chan = 0;
6924		alu.src[1].sel = ctx->temp_reg;
6925		alu.src[1].chan = 2;
6926
6927		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6928		alu.src[2].chan = 0;
6929		alu.src[2].value = *(uint32_t *)&one_point_five;
6930
6931		alu.dst.sel = ctx->temp_reg;
6932		alu.dst.chan = 0;
6933		alu.dst.write = 1;
6934
6935		r = r600_bytecode_add_alu(ctx->bc, &alu);
6936		if (r)
6937			return r;
6938
6939		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6940		alu.op = ALU_OP3_MULADD;
6941		alu.is_op3 = 1;
6942
6943		alu.src[0].sel = ctx->temp_reg;
6944		alu.src[0].chan = 1;
6945		alu.src[1].sel = ctx->temp_reg;
6946		alu.src[1].chan = 2;
6947
6948		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
6949		alu.src[2].chan = 0;
6950		alu.src[2].value = *(uint32_t *)&one_point_five;
6951
6952		alu.dst.sel = ctx->temp_reg;
6953		alu.dst.chan = 1;
6954		alu.dst.write = 1;
6955
6956		alu.last = 1;
6957		r = r600_bytecode_add_alu(ctx->bc, &alu);
6958		if (r)
6959			return r;
6960		/* write initial compare value into Z component
6961		  - W src 0 for shadow cube
6962		  - X src 1 for shadow cube array */
6963		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6964		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6965			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6966			alu.op = ALU_OP1_MOV;
6967			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
6968				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6969			else
6970				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6971			alu.dst.sel = ctx->temp_reg;
6972			alu.dst.chan = 2;
6973			alu.dst.write = 1;
6974			alu.last = 1;
6975			r = r600_bytecode_add_alu(ctx->bc, &alu);
6976			if (r)
6977				return r;
6978		}
6979
6980		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6981		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6982			if (ctx->bc->chip_class >= EVERGREEN) {
6983				int mytmp = r600_get_temp(ctx);
6984				static const float eight = 8.0f;
6985				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6986				alu.op = ALU_OP1_MOV;
6987				alu.src[0].sel = ctx->temp_reg;
6988				alu.src[0].chan = 3;
6989				alu.dst.sel = mytmp;
6990				alu.dst.chan = 0;
6991				alu.dst.write = 1;
6992				alu.last = 1;
6993				r = r600_bytecode_add_alu(ctx->bc, &alu);
6994				if (r)
6995					return r;
6996
6997				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
6998				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6999				alu.op = ALU_OP3_MULADD;
7000				alu.is_op3 = 1;
7001				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7002				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7003				alu.src[1].chan = 0;
7004				alu.src[1].value = *(uint32_t *)&eight;
7005				alu.src[2].sel = mytmp;
7006				alu.src[2].chan = 0;
7007				alu.dst.sel = ctx->temp_reg;
7008				alu.dst.chan = 3;
7009				alu.dst.write = 1;
7010				alu.last = 1;
7011				r = r600_bytecode_add_alu(ctx->bc, &alu);
7012				if (r)
7013					return r;
7014			} else if (ctx->bc->chip_class < EVERGREEN) {
7015				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7016				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7017				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7018				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7019				tex.src_gpr = r600_get_temp(ctx);
7020				tex.src_sel_x = 0;
7021				tex.src_sel_y = 0;
7022				tex.src_sel_z = 0;
7023				tex.src_sel_w = 0;
7024				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7025				tex.coord_type_x = 1;
7026				tex.coord_type_y = 1;
7027				tex.coord_type_z = 1;
7028				tex.coord_type_w = 1;
7029				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7030				alu.op = ALU_OP1_MOV;
7031				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7032				alu.dst.sel = tex.src_gpr;
7033				alu.dst.chan = 0;
7034				alu.last = 1;
7035				alu.dst.write = 1;
7036				r = r600_bytecode_add_alu(ctx->bc, &alu);
7037				if (r)
7038					return r;
7039
7040				r = r600_bytecode_add_tex(ctx->bc, &tex);
7041				if (r)
7042					return r;
7043			}
7044
7045		}
7046
7047		/* for cube forms of lod and bias we need to route things */
7048		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7049		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7050		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7051		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7052			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7053			alu.op = ALU_OP1_MOV;
7054			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7055			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7056				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7057			else
7058				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7059			alu.dst.sel = ctx->temp_reg;
7060			alu.dst.chan = 2;
7061			alu.last = 1;
7062			alu.dst.write = 1;
7063			r = r600_bytecode_add_alu(ctx->bc, &alu);
7064			if (r)
7065				return r;
7066		}
7067
7068		src_loaded = TRUE;
7069		src_gpr = ctx->temp_reg;
7070	}
7071
7072	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7073		int temp_h = 0, temp_v = 0;
7074		int start_val = 0;
7075
7076		/* if we've already loaded the src (i.e. CUBE don't reload it). */
7077		if (src_loaded == TRUE)
7078			start_val = 1;
7079		else
7080			src_loaded = TRUE;
7081		for (i = start_val; i < 3; i++) {
7082			int treg = r600_get_temp(ctx);
7083
7084			if (i == 0)
7085				src_gpr = treg;
7086			else if (i == 1)
7087				temp_h = treg;
7088			else
7089				temp_v = treg;
7090
7091			for (j = 0; j < 4; j++) {
7092				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7093				alu.op = ALU_OP1_MOV;
7094                                r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7095                                alu.dst.sel = treg;
7096                                alu.dst.chan = j;
7097                                if (j == 3)
7098                                   alu.last = 1;
7099                                alu.dst.write = 1;
7100                                r = r600_bytecode_add_alu(ctx->bc, &alu);
7101                                if (r)
7102                                    return r;
7103			}
7104		}
7105		for (i = 1; i < 3; i++) {
7106			/* set gradients h/v */
7107			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7108			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7109				FETCH_OP_SET_GRADIENTS_V;
7110			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7111			tex.sampler_index_mode = sampler_index_mode;
7112			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7113			tex.resource_index_mode = sampler_index_mode;
7114
7115			tex.src_gpr = (i == 1) ? temp_h : temp_v;
7116			tex.src_sel_x = 0;
7117			tex.src_sel_y = 1;
7118			tex.src_sel_z = 2;
7119			tex.src_sel_w = 3;
7120
7121			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7122			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7123			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7124				tex.coord_type_x = 1;
7125				tex.coord_type_y = 1;
7126				tex.coord_type_z = 1;
7127				tex.coord_type_w = 1;
7128			}
7129			r = r600_bytecode_add_tex(ctx->bc, &tex);
7130			if (r)
7131				return r;
7132		}
7133	}
7134
7135	if (src_requires_loading && !src_loaded) {
7136		for (i = 0; i < 4; i++) {
7137			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7138			alu.op = ALU_OP1_MOV;
7139			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7140			alu.dst.sel = ctx->temp_reg;
7141			alu.dst.chan = i;
7142			if (i == 3)
7143				alu.last = 1;
7144			alu.dst.write = 1;
7145			r = r600_bytecode_add_alu(ctx->bc, &alu);
7146			if (r)
7147				return r;
7148		}
7149		src_loaded = TRUE;
7150		src_gpr = ctx->temp_reg;
7151	}
7152
7153	/* get offset values */
7154	if (inst->Texture.NumOffsets) {
7155		assert(inst->Texture.NumOffsets == 1);
7156
7157		/* The texture offset feature doesn't work with the TXF instruction
7158		 * and must be emulated by adding the offset to the texture coordinates. */
7159		if (txf_add_offsets) {
7160			const struct tgsi_texture_offset *off = inst->TexOffsets;
7161
7162			switch (inst->Texture.Texture) {
7163			case TGSI_TEXTURE_3D:
7164				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7165				alu.op = ALU_OP2_ADD_INT;
7166				alu.src[0].sel = src_gpr;
7167				alu.src[0].chan = 2;
7168				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7169				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7170				alu.dst.sel = src_gpr;
7171				alu.dst.chan = 2;
7172				alu.dst.write = 1;
7173				alu.last = 1;
7174				r = r600_bytecode_add_alu(ctx->bc, &alu);
7175				if (r)
7176					return r;
7177				/* fall through */
7178
7179			case TGSI_TEXTURE_2D:
7180			case TGSI_TEXTURE_SHADOW2D:
7181			case TGSI_TEXTURE_RECT:
7182			case TGSI_TEXTURE_SHADOWRECT:
7183			case TGSI_TEXTURE_2D_ARRAY:
7184			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7185				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7186				alu.op = ALU_OP2_ADD_INT;
7187				alu.src[0].sel = src_gpr;
7188				alu.src[0].chan = 1;
7189				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7190				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7191				alu.dst.sel = src_gpr;
7192				alu.dst.chan = 1;
7193				alu.dst.write = 1;
7194				alu.last = 1;
7195				r = r600_bytecode_add_alu(ctx->bc, &alu);
7196				if (r)
7197					return r;
7198				/* fall through */
7199
7200			case TGSI_TEXTURE_1D:
7201			case TGSI_TEXTURE_SHADOW1D:
7202			case TGSI_TEXTURE_1D_ARRAY:
7203			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7204				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7205				alu.op = ALU_OP2_ADD_INT;
7206				alu.src[0].sel = src_gpr;
7207				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7208				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7209				alu.dst.sel = src_gpr;
7210				alu.dst.write = 1;
7211				alu.last = 1;
7212				r = r600_bytecode_add_alu(ctx->bc, &alu);
7213				if (r)
7214					return r;
7215				break;
7216				/* texture offsets do not apply to other texture targets */
7217			}
7218		} else {
7219			switch (inst->Texture.Texture) {
7220			case TGSI_TEXTURE_3D:
7221				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7222				/* fallthrough */
7223			case TGSI_TEXTURE_2D:
7224			case TGSI_TEXTURE_SHADOW2D:
7225			case TGSI_TEXTURE_RECT:
7226			case TGSI_TEXTURE_SHADOWRECT:
7227			case TGSI_TEXTURE_2D_ARRAY:
7228			case TGSI_TEXTURE_SHADOW2D_ARRAY:
7229				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7230				/* fallthrough */
7231			case TGSI_TEXTURE_1D:
7232			case TGSI_TEXTURE_SHADOW1D:
7233			case TGSI_TEXTURE_1D_ARRAY:
7234			case TGSI_TEXTURE_SHADOW1D_ARRAY:
7235				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7236			}
7237		}
7238	}
7239
7240	/* Obtain the sample index for reading a compressed MSAA color texture.
7241	 * To read the FMASK, we use the ldfptr instruction, which tells us
7242	 * where the samples are stored.
7243	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7244	 * which is the identity mapping. Each nibble says which physical sample
7245	 * should be fetched to get that sample.
7246	 *
7247	 * Assume src.z contains the sample index. It should be modified like this:
7248	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7249	 * Then fetch the texel with src.
7250	 */
7251	if (read_compressed_msaa) {
7252		unsigned sample_chan = 3;
7253		unsigned temp = r600_get_temp(ctx);
7254		assert(src_loaded);
7255
7256		/* temp.w = ldfptr() */
7257		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7258		tex.op = FETCH_OP_LD;
7259		tex.inst_mod = 1; /* to indicate this is ldfptr */
7260		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7261		tex.sampler_index_mode = sampler_index_mode;
7262		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7263		tex.resource_index_mode = sampler_index_mode;
7264		tex.src_gpr = src_gpr;
7265		tex.dst_gpr = temp;
7266		tex.dst_sel_x = 7; /* mask out these components */
7267		tex.dst_sel_y = 7;
7268		tex.dst_sel_z = 7;
7269		tex.dst_sel_w = 0; /* store X */
7270		tex.src_sel_x = 0;
7271		tex.src_sel_y = 1;
7272		tex.src_sel_z = 2;
7273		tex.src_sel_w = 3;
7274		tex.offset_x = offset_x;
7275		tex.offset_y = offset_y;
7276		tex.offset_z = offset_z;
7277		r = r600_bytecode_add_tex(ctx->bc, &tex);
7278		if (r)
7279			return r;
7280
7281		/* temp.x = sample_index*4 */
7282		if (ctx->bc->chip_class == CAYMAN) {
7283			for (i = 0 ; i < 4; i++) {
7284				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7285				alu.op = ALU_OP2_MULLO_INT;
7286				alu.src[0].sel = src_gpr;
7287				alu.src[0].chan = sample_chan;
7288				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7289				alu.src[1].value = 4;
7290				alu.dst.sel = temp;
7291				alu.dst.chan = i;
7292				alu.dst.write = i == 0;
7293				if (i == 3)
7294					alu.last = 1;
7295				r = r600_bytecode_add_alu(ctx->bc, &alu);
7296				if (r)
7297					return r;
7298			}
7299		} else {
7300			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7301			alu.op = ALU_OP2_MULLO_INT;
7302			alu.src[0].sel = src_gpr;
7303			alu.src[0].chan = sample_chan;
7304			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7305			alu.src[1].value = 4;
7306			alu.dst.sel = temp;
7307			alu.dst.chan = 0;
7308			alu.dst.write = 1;
7309			alu.last = 1;
7310			r = r600_bytecode_add_alu(ctx->bc, &alu);
7311			if (r)
7312				return r;
7313		}
7314
7315		/* sample_index = temp.w >> temp.x */
7316		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7317		alu.op = ALU_OP2_LSHR_INT;
7318		alu.src[0].sel = temp;
7319		alu.src[0].chan = 3;
7320		alu.src[1].sel = temp;
7321		alu.src[1].chan = 0;
7322		alu.dst.sel = src_gpr;
7323		alu.dst.chan = sample_chan;
7324		alu.dst.write = 1;
7325		alu.last = 1;
7326		r = r600_bytecode_add_alu(ctx->bc, &alu);
7327		if (r)
7328			return r;
7329
7330		/* sample_index & 0xF */
7331		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7332		alu.op = ALU_OP2_AND_INT;
7333		alu.src[0].sel = src_gpr;
7334		alu.src[0].chan = sample_chan;
7335		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7336		alu.src[1].value = 0xF;
7337		alu.dst.sel = src_gpr;
7338		alu.dst.chan = sample_chan;
7339		alu.dst.write = 1;
7340		alu.last = 1;
7341		r = r600_bytecode_add_alu(ctx->bc, &alu);
7342		if (r)
7343			return r;
7344#if 0
7345		/* visualize the FMASK */
7346		for (i = 0; i < 4; i++) {
7347			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7348			alu.op = ALU_OP1_INT_TO_FLT;
7349			alu.src[0].sel = src_gpr;
7350			alu.src[0].chan = sample_chan;
7351			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7352			alu.dst.chan = i;
7353			alu.dst.write = 1;
7354			alu.last = 1;
7355			r = r600_bytecode_add_alu(ctx->bc, &alu);
7356			if (r)
7357				return r;
7358		}
7359		return 0;
7360#endif
7361	}
7362
7363	/* does this shader want a num layers from TXQ for a cube array? */
7364	if (has_txq_cube_array_z) {
7365		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7366
7367		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7368		alu.op = ALU_OP1_MOV;
7369
7370		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7371		if (ctx->bc->chip_class >= EVERGREEN) {
7372			/* channel 1 or 3 of each word */
7373			alu.src[0].sel += (id / 2);
7374			alu.src[0].chan = ((id % 2) * 2) + 1;
7375		} else {
7376			/* r600 we have them at channel 2 of the second dword */
7377			alu.src[0].sel += (id * 2) + 1;
7378			alu.src[0].chan = 2;
7379		}
7380		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7381		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7382		alu.last = 1;
7383		r = r600_bytecode_add_alu(ctx->bc, &alu);
7384		if (r)
7385			return r;
7386		/* disable writemask from texture instruction */
7387		inst->Dst[0].Register.WriteMask &= ~4;
7388	}
7389
7390	opcode = ctx->inst_info->op;
7391	if (opcode == FETCH_OP_GATHER4 &&
7392		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7393		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7394		opcode = FETCH_OP_GATHER4_O;
7395
7396		/* GATHER4_O/GATHER4_C_O use offset values loaded by
7397		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
7398		   encoded in the instruction are ignored. */
7399		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7400		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7401		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7402		tex.sampler_index_mode = sampler_index_mode;
7403		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7404		tex.resource_index_mode = sampler_index_mode;
7405
7406		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7407		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7408		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7409		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7410		tex.src_sel_w = 4;
7411
7412		tex.dst_sel_x = 7;
7413		tex.dst_sel_y = 7;
7414		tex.dst_sel_z = 7;
7415		tex.dst_sel_w = 7;
7416
7417		r = r600_bytecode_add_tex(ctx->bc, &tex);
7418		if (r)
7419			return r;
7420	}
7421
7422	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7423	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7424	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7425	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7426	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7427	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7428	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7429		switch (opcode) {
7430		case FETCH_OP_SAMPLE:
7431			opcode = FETCH_OP_SAMPLE_C;
7432			break;
7433		case FETCH_OP_SAMPLE_L:
7434			opcode = FETCH_OP_SAMPLE_C_L;
7435			break;
7436		case FETCH_OP_SAMPLE_LB:
7437			opcode = FETCH_OP_SAMPLE_C_LB;
7438			break;
7439		case FETCH_OP_SAMPLE_G:
7440			opcode = FETCH_OP_SAMPLE_C_G;
7441			break;
7442		/* Texture gather variants */
7443		case FETCH_OP_GATHER4:
7444			opcode = FETCH_OP_GATHER4_C;
7445			break;
7446		case FETCH_OP_GATHER4_O:
7447			opcode = FETCH_OP_GATHER4_C_O;
7448			break;
7449		}
7450	}
7451
7452	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7453	tex.op = opcode;
7454
7455	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7456	tex.sampler_index_mode = sampler_index_mode;
7457	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7458	tex.resource_index_mode = sampler_index_mode;
7459	tex.src_gpr = src_gpr;
7460	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7461
7462	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7463		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7464		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7465	}
7466
7467	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7468		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7469		tex.inst_mod = texture_component_select;
7470
7471		if (ctx->bc->chip_class == CAYMAN) {
7472		/* GATHER4 result order is different from TGSI TG4 */
7473			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7474			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7475			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7476			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7477		} else {
7478			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7479			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7480			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7481			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7482		}
7483	}
7484	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7485		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7486		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7487		tex.dst_sel_z = 7;
7488		tex.dst_sel_w = 7;
7489	}
7490	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7491		tex.dst_sel_x = 3;
7492		tex.dst_sel_y = 7;
7493		tex.dst_sel_z = 7;
7494		tex.dst_sel_w = 7;
7495	}
7496	else {
7497		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7498		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7499		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7500		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7501	}
7502
7503
7504	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
7505	    inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7506		tex.src_sel_x = 4;
7507		tex.src_sel_y = 4;
7508		tex.src_sel_z = 4;
7509		tex.src_sel_w = 4;
7510	} else if (src_loaded) {
7511		tex.src_sel_x = 0;
7512		tex.src_sel_y = 1;
7513		tex.src_sel_z = 2;
7514		tex.src_sel_w = 3;
7515	} else {
7516		tex.src_sel_x = ctx->src[0].swizzle[0];
7517		tex.src_sel_y = ctx->src[0].swizzle[1];
7518		tex.src_sel_z = ctx->src[0].swizzle[2];
7519		tex.src_sel_w = ctx->src[0].swizzle[3];
7520		tex.src_rel = ctx->src[0].rel;
7521	}
7522
7523	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7524	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7525	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7526	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7527		tex.src_sel_x = 1;
7528		tex.src_sel_y = 0;
7529		tex.src_sel_z = 3;
7530		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7531	}
7532
7533	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7534	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7535		tex.coord_type_x = 1;
7536		tex.coord_type_y = 1;
7537	}
7538	tex.coord_type_z = 1;
7539	tex.coord_type_w = 1;
7540
7541	tex.offset_x = offset_x;
7542	tex.offset_y = offset_y;
7543	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7544		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7545		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7546		tex.offset_z = 0;
7547	}
7548	else {
7549		tex.offset_z = offset_z;
7550	}
7551
7552	/* Put the depth for comparison in W.
7553	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7554	 * Some instructions expect the depth in Z. */
7555	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7556	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7557	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7558	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7559	    opcode != FETCH_OP_SAMPLE_C_L &&
7560	    opcode != FETCH_OP_SAMPLE_C_LB) {
7561		tex.src_sel_w = tex.src_sel_z;
7562	}
7563
7564	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7565	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7566		if (opcode == FETCH_OP_SAMPLE_C_L ||
7567		    opcode == FETCH_OP_SAMPLE_C_LB) {
7568			/* the array index is read from Y */
7569			tex.coord_type_y = 0;
7570		} else {
7571			/* the array index is read from Z */
7572			tex.coord_type_z = 0;
7573			tex.src_sel_z = tex.src_sel_y;
7574		}
7575	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7576		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7577		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7578		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7579		    (ctx->bc->chip_class >= EVERGREEN)))
7580		/* the array index is read from Z */
7581		tex.coord_type_z = 0;
7582
7583	/* mask unused source components */
7584	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7585		switch (inst->Texture.Texture) {
7586		case TGSI_TEXTURE_2D:
7587		case TGSI_TEXTURE_RECT:
7588			tex.src_sel_z = 7;
7589			tex.src_sel_w = 7;
7590			break;
7591		case TGSI_TEXTURE_1D_ARRAY:
7592			tex.src_sel_y = 7;
7593			tex.src_sel_w = 7;
7594			break;
7595		case TGSI_TEXTURE_1D:
7596			tex.src_sel_y = 7;
7597			tex.src_sel_z = 7;
7598			tex.src_sel_w = 7;
7599			break;
7600		}
7601	}
7602
7603	r = r600_bytecode_add_tex(ctx->bc, &tex);
7604	if (r)
7605		return r;
7606
7607	/* add shadow ambient support  - gallium doesn't do it yet */
7608	return 0;
7609}
7610
/* Emit TGSI LRP: dst = src0 * src1 + (1 - src0) * src2, per enabled
 * write-mask channel.
 *
 * Fast path: when src0 is the inline 0.5 constant the blend reduces to
 * (src1 + src2) / 2, emitted as a single ADD with an output modifier.
 * Otherwise three ALU passes go through ctx->temp_reg:
 *   1. temp = 1 - src0
 *   2. temp = temp * src2
 *   3. dst  = src0 * src1 + temp   (MULADD)
 *
 * Returns 0 on success, or the error from r600_bytecode_add_alu. */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			/* output modifier: halve the sum, i.e. average src1/src2 */
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		/* computed as 1 + (-src0) */
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */
	/* abs-modified sources are staged through scratch registers by
	 * tgsi_make_src_for_op3, since op3 ALU sources can't carry abs */
        if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		/* third MULADD operand is the (1 - src0) * src2 partial result */
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7722
7723static int tgsi_cmp(struct r600_shader_ctx *ctx)
7724{
7725	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7726	struct r600_bytecode_alu alu;
7727	int i, r, j;
7728	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7729	int temp_regs[3];
7730
7731	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7732		temp_regs[j] = 0;
7733		if (ctx->src[j].abs)
7734			temp_regs[j] = r600_get_temp(ctx);
7735	}
7736
7737	for (i = 0; i < lasti + 1; i++) {
7738		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7739			continue;
7740
7741		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7742		alu.op = ALU_OP3_CNDGE;
7743		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
7744		if (r)
7745			return r;
7746		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
7747		if (r)
7748			return r;
7749		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
7750		if (r)
7751			return r;
7752		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7753		alu.dst.chan = i;
7754		alu.dst.write = 1;
7755		alu.is_op3 = 1;
7756		if (i == lasti)
7757			alu.last = 1;
7758		r = r600_bytecode_add_alu(ctx->bc, &alu);
7759		if (r)
7760			return r;
7761	}
7762	return 0;
7763}
7764
7765static int tgsi_ucmp(struct r600_shader_ctx *ctx)
7766{
7767	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7768	struct r600_bytecode_alu alu;
7769	int i, r;
7770	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7771
7772	for (i = 0; i < lasti + 1; i++) {
7773		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7774			continue;
7775
7776		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7777		alu.op = ALU_OP3_CNDE_INT;
7778		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7779		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7780		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
7781		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7782		alu.dst.chan = i;
7783		alu.dst.write = 1;
7784		alu.is_op3 = 1;
7785		if (i == lasti)
7786			alu.last = 1;
7787		r = r600_bytecode_add_alu(ctx->bc, &alu);
7788		if (r)
7789			return r;
7790	}
7791	return 0;
7792}
7793
/* Emit TGSI XPD (cross product):
 *   dst.x = src0.y*src1.z - src0.z*src1.y
 *   dst.y = src0.z*src1.x - src0.x*src1.z
 *   dst.z = src0.x*src1.y - src0.y*src1.x
 *
 * Pass 1 computes temp = src0.zxy * src1.yzx; pass 2 computes
 * src0.yzx * src1.zxy - temp via MULADD with a negated third operand.
 * The w lane is fed zero constants in both passes, so it ends up 0.
 * NOTE(review): TGSI defines XPD.w as 1.0 — presumably w is not
 * normally in the writemask here; confirm before relying on dst.w.
 *
 * With a partial writemask the result is staged in ctx->temp_reg and
 * moved to the real destination by tgsi_helper_copy. */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* component sources for the two multiply passes: zxy and yzx */
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* pass 1: temp = src0.zxy * src1.yzx */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			/* w lane: 0 * 0 */
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: dst = src0.yzx * src1.zxy - temp */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			/* note the swizzle tables are swapped vs. pass 1 */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		/* subtract the pass-1 partial product */
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1;
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	if (use_temp)
		return tgsi_helper_copy(ctx, inst);
	return 0;
}
7865
7866static int tgsi_exp(struct r600_shader_ctx *ctx)
7867{
7868	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7869	struct r600_bytecode_alu alu;
7870	int r;
7871	int i;
7872
7873	/* result.x = 2^floor(src); */
7874	if (inst->Dst[0].Register.WriteMask & 1) {
7875		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7876
7877		alu.op = ALU_OP1_FLOOR;
7878		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7879
7880		alu.dst.sel = ctx->temp_reg;
7881		alu.dst.chan = 0;
7882		alu.dst.write = 1;
7883		alu.last = 1;
7884		r = r600_bytecode_add_alu(ctx->bc, &alu);
7885		if (r)
7886			return r;
7887
7888		if (ctx->bc->chip_class == CAYMAN) {
7889			for (i = 0; i < 3; i++) {
7890				alu.op = ALU_OP1_EXP_IEEE;
7891				alu.src[0].sel = ctx->temp_reg;
7892				alu.src[0].chan = 0;
7893
7894				alu.dst.sel = ctx->temp_reg;
7895				alu.dst.chan = i;
7896				alu.dst.write = i == 0;
7897				alu.last = i == 2;
7898				r = r600_bytecode_add_alu(ctx->bc, &alu);
7899				if (r)
7900					return r;
7901			}
7902		} else {
7903			alu.op = ALU_OP1_EXP_IEEE;
7904			alu.src[0].sel = ctx->temp_reg;
7905			alu.src[0].chan = 0;
7906
7907			alu.dst.sel = ctx->temp_reg;
7908			alu.dst.chan = 0;
7909			alu.dst.write = 1;
7910			alu.last = 1;
7911			r = r600_bytecode_add_alu(ctx->bc, &alu);
7912			if (r)
7913				return r;
7914		}
7915	}
7916
7917	/* result.y = tmp - floor(tmp); */
7918	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7919		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7920
7921		alu.op = ALU_OP1_FRACT;
7922		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7923
7924		alu.dst.sel = ctx->temp_reg;
7925#if 0
7926		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7927		if (r)
7928			return r;
7929#endif
7930		alu.dst.write = 1;
7931		alu.dst.chan = 1;
7932
7933		alu.last = 1;
7934
7935		r = r600_bytecode_add_alu(ctx->bc, &alu);
7936		if (r)
7937			return r;
7938	}
7939
7940	/* result.z = RoughApprox2ToX(tmp);*/
7941	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
7942		if (ctx->bc->chip_class == CAYMAN) {
7943			for (i = 0; i < 3; i++) {
7944				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7945				alu.op = ALU_OP1_EXP_IEEE;
7946				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7947
7948				alu.dst.sel = ctx->temp_reg;
7949				alu.dst.chan = i;
7950				if (i == 2) {
7951					alu.dst.write = 1;
7952					alu.last = 1;
7953				}
7954
7955				r = r600_bytecode_add_alu(ctx->bc, &alu);
7956				if (r)
7957					return r;
7958			}
7959		} else {
7960			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7961			alu.op = ALU_OP1_EXP_IEEE;
7962			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7963
7964			alu.dst.sel = ctx->temp_reg;
7965			alu.dst.write = 1;
7966			alu.dst.chan = 2;
7967
7968			alu.last = 1;
7969
7970			r = r600_bytecode_add_alu(ctx->bc, &alu);
7971			if (r)
7972				return r;
7973		}
7974	}
7975
7976	/* result.w = 1.0;*/
7977	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
7978		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7979
7980		alu.op = ALU_OP1_MOV;
7981		alu.src[0].sel = V_SQ_ALU_SRC_1;
7982		alu.src[0].chan = 0;
7983
7984		alu.dst.sel = ctx->temp_reg;
7985		alu.dst.chan = 3;
7986		alu.dst.write = 1;
7987		alu.last = 1;
7988		r = r600_bytecode_add_alu(ctx->bc, &alu);
7989		if (r)
7990			return r;
7991	}
7992	return tgsi_helper_copy(ctx, inst);
7993}
7994
7995static int tgsi_log(struct r600_shader_ctx *ctx)
7996{
7997	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7998	struct r600_bytecode_alu alu;
7999	int r;
8000	int i;
8001
8002	/* result.x = floor(log2(|src|)); */
8003	if (inst->Dst[0].Register.WriteMask & 1) {
8004		if (ctx->bc->chip_class == CAYMAN) {
8005			for (i = 0; i < 3; i++) {
8006				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8007
8008				alu.op = ALU_OP1_LOG_IEEE;
8009				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8010				r600_bytecode_src_set_abs(&alu.src[0]);
8011
8012				alu.dst.sel = ctx->temp_reg;
8013				alu.dst.chan = i;
8014				if (i == 0)
8015					alu.dst.write = 1;
8016				if (i == 2)
8017					alu.last = 1;
8018				r = r600_bytecode_add_alu(ctx->bc, &alu);
8019				if (r)
8020					return r;
8021			}
8022
8023		} else {
8024			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8025
8026			alu.op = ALU_OP1_LOG_IEEE;
8027			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8028			r600_bytecode_src_set_abs(&alu.src[0]);
8029
8030			alu.dst.sel = ctx->temp_reg;
8031			alu.dst.chan = 0;
8032			alu.dst.write = 1;
8033			alu.last = 1;
8034			r = r600_bytecode_add_alu(ctx->bc, &alu);
8035			if (r)
8036				return r;
8037		}
8038
8039		alu.op = ALU_OP1_FLOOR;
8040		alu.src[0].sel = ctx->temp_reg;
8041		alu.src[0].chan = 0;
8042
8043		alu.dst.sel = ctx->temp_reg;
8044		alu.dst.chan = 0;
8045		alu.dst.write = 1;
8046		alu.last = 1;
8047
8048		r = r600_bytecode_add_alu(ctx->bc, &alu);
8049		if (r)
8050			return r;
8051	}
8052
8053	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
8054	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
8055
8056		if (ctx->bc->chip_class == CAYMAN) {
8057			for (i = 0; i < 3; i++) {
8058				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8059
8060				alu.op = ALU_OP1_LOG_IEEE;
8061				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8062				r600_bytecode_src_set_abs(&alu.src[0]);
8063
8064				alu.dst.sel = ctx->temp_reg;
8065				alu.dst.chan = i;
8066				if (i == 1)
8067					alu.dst.write = 1;
8068				if (i == 2)
8069					alu.last = 1;
8070
8071				r = r600_bytecode_add_alu(ctx->bc, &alu);
8072				if (r)
8073					return r;
8074			}
8075		} else {
8076			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8077
8078			alu.op = ALU_OP1_LOG_IEEE;
8079			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8080			r600_bytecode_src_set_abs(&alu.src[0]);
8081
8082			alu.dst.sel = ctx->temp_reg;
8083			alu.dst.chan = 1;
8084			alu.dst.write = 1;
8085			alu.last = 1;
8086
8087			r = r600_bytecode_add_alu(ctx->bc, &alu);
8088			if (r)
8089				return r;
8090		}
8091
8092		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8093
8094		alu.op = ALU_OP1_FLOOR;
8095		alu.src[0].sel = ctx->temp_reg;
8096		alu.src[0].chan = 1;
8097
8098		alu.dst.sel = ctx->temp_reg;
8099		alu.dst.chan = 1;
8100		alu.dst.write = 1;
8101		alu.last = 1;
8102
8103		r = r600_bytecode_add_alu(ctx->bc, &alu);
8104		if (r)
8105			return r;
8106
8107		if (ctx->bc->chip_class == CAYMAN) {
8108			for (i = 0; i < 3; i++) {
8109				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8110				alu.op = ALU_OP1_EXP_IEEE;
8111				alu.src[0].sel = ctx->temp_reg;
8112				alu.src[0].chan = 1;
8113
8114				alu.dst.sel = ctx->temp_reg;
8115				alu.dst.chan = i;
8116				if (i == 1)
8117					alu.dst.write = 1;
8118				if (i == 2)
8119					alu.last = 1;
8120
8121				r = r600_bytecode_add_alu(ctx->bc, &alu);
8122				if (r)
8123					return r;
8124			}
8125		} else {
8126			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8127			alu.op = ALU_OP1_EXP_IEEE;
8128			alu.src[0].sel = ctx->temp_reg;
8129			alu.src[0].chan = 1;
8130
8131			alu.dst.sel = ctx->temp_reg;
8132			alu.dst.chan = 1;
8133			alu.dst.write = 1;
8134			alu.last = 1;
8135
8136			r = r600_bytecode_add_alu(ctx->bc, &alu);
8137			if (r)
8138				return r;
8139		}
8140
8141		if (ctx->bc->chip_class == CAYMAN) {
8142			for (i = 0; i < 3; i++) {
8143				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8144				alu.op = ALU_OP1_RECIP_IEEE;
8145				alu.src[0].sel = ctx->temp_reg;
8146				alu.src[0].chan = 1;
8147
8148				alu.dst.sel = ctx->temp_reg;
8149				alu.dst.chan = i;
8150				if (i == 1)
8151					alu.dst.write = 1;
8152				if (i == 2)
8153					alu.last = 1;
8154
8155				r = r600_bytecode_add_alu(ctx->bc, &alu);
8156				if (r)
8157					return r;
8158			}
8159		} else {
8160			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8161			alu.op = ALU_OP1_RECIP_IEEE;
8162			alu.src[0].sel = ctx->temp_reg;
8163			alu.src[0].chan = 1;
8164
8165			alu.dst.sel = ctx->temp_reg;
8166			alu.dst.chan = 1;
8167			alu.dst.write = 1;
8168			alu.last = 1;
8169
8170			r = r600_bytecode_add_alu(ctx->bc, &alu);
8171			if (r)
8172				return r;
8173		}
8174
8175		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8176
8177		alu.op = ALU_OP2_MUL;
8178
8179		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8180		r600_bytecode_src_set_abs(&alu.src[0]);
8181
8182		alu.src[1].sel = ctx->temp_reg;
8183		alu.src[1].chan = 1;
8184
8185		alu.dst.sel = ctx->temp_reg;
8186		alu.dst.chan = 1;
8187		alu.dst.write = 1;
8188		alu.last = 1;
8189
8190		r = r600_bytecode_add_alu(ctx->bc, &alu);
8191		if (r)
8192			return r;
8193	}
8194
8195	/* result.z = log2(|src|);*/
8196	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
8197		if (ctx->bc->chip_class == CAYMAN) {
8198			for (i = 0; i < 3; i++) {
8199				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8200
8201				alu.op = ALU_OP1_LOG_IEEE;
8202				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8203				r600_bytecode_src_set_abs(&alu.src[0]);
8204
8205				alu.dst.sel = ctx->temp_reg;
8206				if (i == 2)
8207					alu.dst.write = 1;
8208				alu.dst.chan = i;
8209				if (i == 2)
8210					alu.last = 1;
8211
8212				r = r600_bytecode_add_alu(ctx->bc, &alu);
8213				if (r)
8214					return r;
8215			}
8216		} else {
8217			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8218
8219			alu.op = ALU_OP1_LOG_IEEE;
8220			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8221			r600_bytecode_src_set_abs(&alu.src[0]);
8222
8223			alu.dst.sel = ctx->temp_reg;
8224			alu.dst.write = 1;
8225			alu.dst.chan = 2;
8226			alu.last = 1;
8227
8228			r = r600_bytecode_add_alu(ctx->bc, &alu);
8229			if (r)
8230				return r;
8231		}
8232	}
8233
8234	/* result.w = 1.0; */
8235	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
8236		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8237
8238		alu.op = ALU_OP1_MOV;
8239		alu.src[0].sel = V_SQ_ALU_SRC_1;
8240		alu.src[0].chan = 0;
8241
8242		alu.dst.sel = ctx->temp_reg;
8243		alu.dst.chan = 3;
8244		alu.dst.write = 1;
8245		alu.last = 1;
8246
8247		r = r600_bytecode_add_alu(ctx->bc, &alu);
8248		if (r)
8249			return r;
8250	}
8251
8252	return tgsi_helper_copy(ctx, inst);
8253}
8254
8255static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
8256{
8257	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8258	struct r600_bytecode_alu alu;
8259	int r;
8260	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8261	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
8262
8263	assert(inst->Dst[0].Register.Index < 3);
8264	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8265
8266	switch (inst->Instruction.Opcode) {
8267	case TGSI_OPCODE_ARL:
8268		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
8269		break;
8270	case TGSI_OPCODE_ARR:
8271		alu.op = ALU_OP1_FLT_TO_INT;
8272		break;
8273	case TGSI_OPCODE_UARL:
8274		alu.op = ALU_OP1_MOV;
8275		break;
8276	default:
8277		assert(0);
8278		return -1;
8279	}
8280
8281	for (i = 0; i <= lasti; ++i) {
8282		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8283			continue;
8284		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8285		alu.last = i == lasti;
8286		alu.dst.sel = reg;
8287	        alu.dst.chan = i;
8288		alu.dst.write = 1;
8289		r = r600_bytecode_add_alu(ctx->bc, &alu);
8290		if (r)
8291			return r;
8292	}
8293
8294	if (inst->Dst[0].Register.Index > 0)
8295		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
8296	else
8297		ctx->bc->ar_loaded = 0;
8298
8299	return 0;
8300}
/* Emit TGSI ARL/ARR/UARL for pre-Evergreen (r600/r700) chips.
 *
 * The address value is staged in the dedicated ar_reg temp; ar_loaded is
 * cleared at the end so the next indirect access re-issues MOVA from
 * ar_reg.  FLT_TO_INT is a trans-unit-only op on these chips, so every
 * FLT_TO_INT below is emitted with last=1 (one instruction group each).
 */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* Pass 1: FLOOR the written channels into ar_reg. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* Pass 2: convert ar_reg to integer in place.
		 * NOTE(review): this loop runs over channels 0..lasti without
		 * checking the writemask, so channels skipped by pass 1 are
		 * converted from whatever ar_reg held — presumably harmless
		 * since only written channels are consumed; confirm. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ARR rounds (FLT_TO_INT) straight into ar_reg. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* Source is already an integer: plain MOV into ar_reg. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* Invalidate the cached AR so it is reloaded from ar_reg. */
	ctx->bc->ar_loaded = 0;
	return 0;
}
8377
8378static int tgsi_opdst(struct r600_shader_ctx *ctx)
8379{
8380	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8381	struct r600_bytecode_alu alu;
8382	int i, r = 0;
8383
8384	for (i = 0; i < 4; i++) {
8385		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8386
8387		alu.op = ALU_OP2_MUL;
8388		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8389
8390		if (i == 0 || i == 3) {
8391			alu.src[0].sel = V_SQ_ALU_SRC_1;
8392		} else {
8393			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8394		}
8395
8396		if (i == 0 || i == 2) {
8397			alu.src[1].sel = V_SQ_ALU_SRC_1;
8398		} else {
8399			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8400		}
8401		if (i == 3)
8402			alu.last = 1;
8403		r = r600_bytecode_add_alu(ctx->bc, &alu);
8404		if (r)
8405			return r;
8406	}
8407	return 0;
8408}
8409
8410static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
8411{
8412	struct r600_bytecode_alu alu;
8413	int r;
8414
8415	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8416	alu.op = opcode;
8417	alu.execute_mask = 1;
8418	alu.update_pred = 1;
8419
8420	alu.dst.sel = ctx->temp_reg;
8421	alu.dst.write = 1;
8422	alu.dst.chan = 0;
8423
8424	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8425	alu.src[1].sel = V_SQ_ALU_SRC_0;
8426	alu.src[1].chan = 0;
8427
8428	alu.last = 1;
8429
8430	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
8431	if (r)
8432		return r;
8433	return 0;
8434}
8435
/* Emit a pop of `pops` levels off the branch stack.  Where possible the
 * pop is folded into the preceding ALU clause by upgrading its CF op to
 * ALU_POP_AFTER / ALU_POP2_AFTER; otherwise a standalone POP CF
 * instruction is emitted. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop = pops already carried by cf_last; 3 means "not a
		 * foldable ALU clause" so the sum below can never hit 1 or 2. */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* cf_last now pops; further instructions need a fresh CF */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more than 2 pops cannot be folded into an ALU clause */
			force_pop = 1;
		}
	}

	if (force_pop) {
		/* standalone POP: continue execution at the next CF slot */
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
8468
/* Recompute the worst-case hardware branch-stack depth implied by the
 * current nesting counters (loop / push / push_wqm) plus the chip-specific
 * reserved elements, and record it in stack->max_entries (used for the
 * shader's STACK_SIZE). */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
                                              unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM pushes consume a full entry; plain pushes one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole hardware stack entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
8532
8533static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
8534{
8535	switch(reason) {
8536	case FC_PUSH_VPM:
8537		--ctx->bc->stack.push;
8538		assert(ctx->bc->stack.push >= 0);
8539		break;
8540	case FC_PUSH_WQM:
8541		--ctx->bc->stack.push_wqm;
8542		assert(ctx->bc->stack.push_wqm >= 0);
8543		break;
8544	case FC_LOOP:
8545		--ctx->bc->stack.loop;
8546		assert(ctx->bc->stack.loop >= 0);
8547		break;
8548	default:
8549		assert(0);
8550		break;
8551	}
8552}
8553
/* Record one more level of control-flow nesting of the given kind and
 * refresh the worst-case stack depth. */
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		/* NOTE(review): no break here, so an FC_PUSH_WQM push also bumps
		 * stack.loop, yet callstack_pop(FC_PUSH_WQM) only decrements
		 * push_wqm — the counters would end up unbalanced.  Looks like a
		 * missing break; confirm whether the fallthrough is intentional
		 * before changing it. */
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	callstack_update_max_depth(ctx, reason);
}
8571
/* Append the current CF instruction to the list of "mid" points (ELSE,
 * BREAK, CONTINUE) of the flow-control stack entry at fc_sp, so that the
 * matching ENDIF/ENDLOOP can patch its jump target later. */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	/* NOTE(review): realloc result is unchecked — on OOM the store below
	 * dereferences NULL (and the old array leaks).  The void return gives
	 * no error path here; flagging rather than changing behavior. */
	sp->mid = realloc((void *)sp->mid,
						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}
8581
8582static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
8583{
8584	ctx->bc->fc_sp++;
8585	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
8586	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
8587}
8588
8589static void fc_poplevel(struct r600_shader_ctx *ctx)
8590{
8591	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
8592	free(sp->mid);
8593	sp->mid = NULL;
8594	sp->num_mid = 0;
8595	sp->start = NULL;
8596	sp->type = 0;
8597	ctx->bc->fc_sp--;
8598}
8599
#if 0
/* Dead prototype code for subroutine RET/CALL and loop-flag handling;
 * never compiled.  Note: the r600_bytecode_add_cfinst() calls in
 * emit_return()/emit_jump_to_offset() carry an extra ')' and would not
 * compile if this block were ever enabled. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
8647
/* Open an IF block: emit the predicate ALU clause (opcode selects the
 * float/int compare against 0) followed by a JUMP whose target is patched
 * later by tgsi_else()/tgsi_endif(). */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	/* jump over the if-body when the predicate fails; target fixed up at
	 * ELSE/ENDIF time via the fc stack entry pushed below */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
8671
/* TGSI IF: float condition, taken when src != 0.0 */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
8676
/* TGSI UIF: integer condition, taken when src != 0 */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
8681
8682static int tgsi_else(struct r600_shader_ctx *ctx)
8683{
8684	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
8685	ctx->bc->cf_last->pop_count = 1;
8686
8687	fc_set_mid(ctx, ctx->bc->fc_sp);
8688	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
8689	return 0;
8690}
8691
8692static int tgsi_endif(struct r600_shader_ctx *ctx)
8693{
8694	pops(ctx, 1);
8695	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
8696		R600_ERR("if/endif unbalanced in shader\n");
8697		return -1;
8698	}
8699
8700	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
8701		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
8702		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
8703	} else {
8704		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
8705	}
8706	fc_poplevel(ctx);
8707
8708	callstack_pop(ctx, FC_PUSH_VPM);
8709	return 0;
8710}
8711
/* TGSI BGNLOOP: open a hardware loop and push a flow-control level; the
 * LOOP_START's target is patched by tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
8724
8725static int tgsi_endloop(struct r600_shader_ctx *ctx)
8726{
8727	int i;
8728
8729	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
8730
8731	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
8732		R600_ERR("loop/endloop in shader code are not paired.\n");
8733		return -EINVAL;
8734	}
8735
8736	/* fixup loop pointers - from r600isa
8737	   LOOP END points to CF after LOOP START,
8738	   LOOP START point to CF after LOOP END
8739	   BRK/CONT point to LOOP END CF
8740	*/
8741	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
8742
8743	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
8744
8745	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
8746		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
8747	}
8748	/* XXX add LOOPRET support */
8749	fc_poplevel(ctx);
8750	callstack_pop(ctx, FC_LOOP);
8751	return 0;
8752}
8753
/* TGSI BREAKC: conditional loop break (break if src.x != 0). */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* find the innermost enclosing loop on the fc stack */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		/* workaround: wrap a plain LOOP_BREAK in IF(src)/ENDIF instead */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		/* record the break so tgsi_endloop() patches its target */
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		/* predicate "src == 0" on an ALU_BREAK clause: lanes failing the
		 * predicate (i.e. src != 0) leave the loop */
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
8792
8793static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
8794{
8795	unsigned int fscp;
8796
8797	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
8798	{
8799		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
8800			break;
8801	}
8802
8803	if (fscp == 0) {
8804		R600_ERR("Break not inside loop/endloop pair\n");
8805		return -EINVAL;
8806	}
8807
8808	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8809
8810	fc_set_mid(ctx, fscp);
8811
8812	return 0;
8813}
8814
8815static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
8816{
8817	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8818	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
8819	int r;
8820
8821	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
8822		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
8823
8824	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
8825	if (!r) {
8826		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
8827		return emit_inc_ring_offset(ctx, stream, TRUE);
8828	}
8829	return r;
8830}
8831
/* TGSI UMAD: dst = src0 * src1 + src2 (unsigned).  The MULLO_UINT results
 * are staged in temp_reg, then the ADD_INT writes the real destination. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman: MULLO_UINT runs in all four vector slots; emit the
			 * whole group but only write back the slot matching channel i
			 * (see the CAYMAN notes at the top of the file). */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* other chips: MULLO_UINT is a trans-unit op, one group per
			 * channel (hence last=1 on every instruction) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* + src2, writing the final destination */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
8903
/* Translation table for pre-Evergreen (r600/r700) chips, indexed by TGSI
 * opcode.  Each entry pairs a hardware op (ALU_OP0_NOP when the handler
 * builds its own instruction sequence) with the emit callback; numeric
 * indices are TGSI opcode slots with no symbolic name. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
9108
9109static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
9110	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
9111	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
9112	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
9113	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
9114	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
9115	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
9116	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
9117	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
9118	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
9119	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
9120	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
9121	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
9122	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
9123	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
9124	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
9125	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
9126	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
9127	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
9128	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
9129	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
9130	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
9131	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
9132	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
9133	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
9134	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
9135	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
9136	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
9137	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
9138	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
9139	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
9140	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
9141	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
9142	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
9143	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
9144	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
9145	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
9146	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
9147	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9148	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9149	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
9150	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9151	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9152	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9153	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9154	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
9155	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
9156	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
9157	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
9158	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
9159	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
9160	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
9161	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
9162	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
9163	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
9164	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
9165	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9166	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9167	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9168	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9169	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
9170	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
9171	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
9172	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
9173	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
9174	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
9175	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
9176	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
9177	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
9178	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9179	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
9180	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
9181	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
9182	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9183	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9184	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
9185	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
9186	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
9187	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
9188	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
9189	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9190	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9191	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
9192	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
9193	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
9194	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
9195	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
9196	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
9197	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
9198	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
9199	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
9200	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
9201	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
9202	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
9203	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9204	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
9205	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9206	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9207	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9208	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9209	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
9210	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9211	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
9212	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9213	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9214	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9215	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
9216	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
9217	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
9218	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
9219	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
9220	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9221	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9222	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
9223	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
9224	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
9225	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
9226	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
9227	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
9228	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
9229	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
9230	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
9231	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
9232	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
9233	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
9234	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
9235	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
9236	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9237	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
9238	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
9239	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
9240	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
9241	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
9242	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
9243	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
9244	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
9245	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
9246	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
9247	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
9248	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
9249	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9250	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
9251	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9252	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
9253	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
9254	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9255	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
9256	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
9257	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
9258	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
9259	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
9260	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
9261	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
9262	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
9263	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
9264	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
9265	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
9266	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
9267	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
9268	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
9269	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
9270	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
9271	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9272	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
9273	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9274	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9275	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9276	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
9277	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
9278	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
9279	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
9280	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
9281	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9282	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9283	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9284	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9285	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9286	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9287	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
9288	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9289	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9290	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
9291	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
9292	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
9293	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
9294	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
9295	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
9296	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
9297	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
9298	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
9299	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
9300	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
9301	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
9302	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9303	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9304	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9305	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
9306	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
9307	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
9308	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
9309	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
9310	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
9311	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
9312	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
9313	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
9314	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
9315	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
9316	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
9317	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
9318	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
9319	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
9320	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
9321	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
9322	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
9323	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
9324	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
9325	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
9326	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
9327	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
9328	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
9329};
9330
9331static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
9332	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
9333	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
9334	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
9335	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
9336	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
9337	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
9338	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
9339	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
9340	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
9341	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
9342	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
9343	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
9344	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
9345	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
9346	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
9347	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
9348	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
9349	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
9350	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
9351	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
9352	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
9353	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
9354	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
9355	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
9356	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
9357	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
9358	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
9359	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
9360	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
9361	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
9362	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
9363	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
9364	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
9365	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
9366	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
9367	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
9368	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
9369	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9370	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9371	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
9372	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9373	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9374	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9375	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9376	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
9377	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
9378	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
9379	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
9380	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
9381	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
9382	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
9383	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
9384	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
9385	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
9386	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
9387	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
9388	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
9389	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
9390	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
9391	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
9392	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
9393	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
9394	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
9395	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
9396	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
9397	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
9398	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
9399	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
9400	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9401	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
9402	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
9403	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
9404	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9405	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9406	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
9407	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
9408	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
9409	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
9410	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
9411	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9412	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9413	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
9414	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
9415	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
9416	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
9417	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
9418	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
9419	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
9420	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
9421	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
9422	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
9423	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
9424	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
9425	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9426	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
9427	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9428	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9429	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9430	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9431	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
9432	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9433	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
9434	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
9435	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9436	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9437	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
9438	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
9439	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
9440	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
9441	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
9442	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9443	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9444	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
9445	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
9446	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
9447	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
9448	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
9449	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
9450	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
9451	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
9452	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
9453	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
9454	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
9455	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
9456	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
9457	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
9458	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9459	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
9460	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
9461	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
9462	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
9463	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
9464	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
9465	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
9466	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
9467	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
9468	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
9469	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
9470	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
9471	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9472	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
9473	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9474	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
9475	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
9476	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
9477	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
9478	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
9479	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
9480	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
9481	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
9482	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
9483	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
9484	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
9485	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
9486	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
9487	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
9488	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
9489	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
9490	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
9491	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
9492	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
9493	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
9494	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
9495	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9496	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9497	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
9498	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
9499	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
9500	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
9501	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
9502	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
9503	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9504	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
9505	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9506	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9507	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
9508	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
9509	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
9510	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
9511	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
9512	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
9513	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
9514	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
9515	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
9516	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
9517	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
9518	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
9519	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
9520	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
9521	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
9522	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
9523	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
9524	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9525	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9526	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
9527	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
9528	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
9529	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
9530	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
9531	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
9532	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
9533	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
9534	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
9535	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
9536	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
9537	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
9538	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
9539	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
9540	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
9541	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
9542	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
9543	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
9544	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
9545	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
9546	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
9547	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
9548	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
9549	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
9550	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
9551};
9552